]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
Merge pull request #316 from KellenRenshaw/lp1860813
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f 19#include <sched.h>
db1b32f6 20#include <stdarg.h>
29a73c2f 21#include <stdbool.h>
0ecddf02 22#include <stdint.h>
29a73c2f
CB
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <time.h>
27#include <unistd.h>
28#include <wait.h>
d89504c4 29#include <linux/magic.h>
237e200e 30#include <linux/sched.h>
29a73c2f
CB
31#include <sys/epoll.h>
32#include <sys/mman.h>
33#include <sys/mount.h>
237e200e
SH
34#include <sys/param.h>
35#include <sys/socket.h>
29a73c2f 36#include <sys/syscall.h>
0ecddf02 37#include <sys/sysinfo.h>
d89504c4 38#include <sys/vfs.h>
237e200e 39
237e200e 40#include "bindings.h"
c9236032
HY
41#include "memory_utils.h"
42#include "config.h"
237e200e 43
29a73c2f
CB
/* Define pivot_root() if missing from the C library.
 * Falls back to the raw syscall when the kernel exposes __NR_pivot_root;
 * otherwise fails with ENOSYS. */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
58
8be92dd1
JS
/* Per-cpu usage counters used by the cpu-view code (values as read from the
 * host's cpuacct controller — see the readers elsewhere in this file). */
struct cpuacct_usage {
	uint64_t user;   /* time accounted to user mode */
	uint64_t system; /* time accounted to kernel mode */
	uint64_t idle;   /* idle time, tracked for the container view */
	bool online;     /* whether this cpu is currently online */
};
65
0e47acaa 66/* The function of hash table.*/
67#define LOAD_SIZE 100 /*the size of hash_table */
6db4f7a3 68#define FLUSH_TIME 5 /*the flush rate */
69#define DEPTH_DIR 3 /*the depth of per cgroup */
70/* The function of calculate loadavg .*/
71#define FSHIFT 11 /* nr of bits of precision */
72#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
73#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
74#define EXP_5 2014 /* 1/exp(5sec/5min) */
75#define EXP_15 2037 /* 1/exp(5sec/15min) */
76#define LOAD_INT(x) ((x) >> FSHIFT)
77#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
beb5024e 78/*
6db4f7a3 79 * This parameter is used for proc_loadavg_read().
80 * 1 means use loadavg, 0 means not use.
81 */
82static int loadavg = 0;
a83618e2 83static volatile sig_atomic_t loadavg_stop = 0;
/* ELF hash of @name, truncated to a non-negative int (bucket selector). */
static int calc_hash(const char *name)
{
	unsigned int h = 0;

	for (const char *p = name; *p; p++) {
		unsigned int top;

		h = (h << 4) + *p;
		top = h & 0xf0000000;
		if (top)
			h ^= top >> 24;
		h &= ~top;
	}

	return (int)(h & 0x7fffffff);
}
98
/* One tracked cgroup in the loadavg hash table. */
struct load_node {
	char *cg;                  /* cgroup path this node tracks */
	unsigned long avenrun[3];  /* Load averages (fixed-point, see FSHIFT) */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;                   /* The file descriptor of the mounted cgroup */
	struct load_node *next;    /* hash-chain successor */
	struct load_node **pre;    /* address of the pointer that points at us */
};

/* One hash bucket: a chain of load_nodes plus the locks guarding it. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node. To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node. To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node. To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
131
static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * Initialize every bucket of the loadavg hash table.
 * Returns 0 on success, -1 on failure (all partially initialized locks
 * are destroyed again before returning).
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
/* Unwind in reverse: first the locks of bucket @i that were already set up,
 * then (out3 loop) all locks of every fully initialized bucket before it. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
174
/*
 * Insert @*n at the head of hash bucket @locate.
 * Takes the bucket's mutex and the read/insert rwlock so concurrent readers
 * never observe a half-linked node; the two unlocks are in the order the
 * original author chose (mutex first), which is harmless.
 */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	/* Maintain the back-pointer invariant: pre always addresses the
	 * pointer that points at this node. */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() finds special node. Not return NULL means success.
 * It should be noted that rdlock isn't unlocked at the end of code
 * because this function is used to read special node. Delete is not
 * allowed before read has ended.
 * unlock rdlock only in proc_loadavg_read().
 * NOTE(review): rdlock is held on BOTH return paths — including the
 * empty-bucket path — so every caller must drop it afterwards.
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* rilock only guards the bucket head; drop it before walking. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
215/* Delete the load_node n and return the next node of it. */
216static struct load_node *del_node(struct load_node *n, int locate)
217{
218 struct load_node *g;
219
220 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
221 if (n->next == NULL) {
222 *(n->pre) = NULL;
223 } else {
224 *(n->pre) = n->next;
225 n->next->pre = n->pre;
226 }
227 g = n->next;
228 free(n->cg);
229 free(n);
230 pthread_rwlock_unlock(&load_hash[locate].rdlock);
231 return g;
232}
233
/* Tear down the loadavg hash table: free every chained load_node and destroy
 * all bucket locks. Assumes no other thread still uses the table. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free the whole chain, then release and destroy the locks. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
056adcef
JS
266
/* Data for CPU view */
struct cg_proc_stat {
	char *cg; /* cgroup this entry belongs to */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* number of entries in usage/view */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next; /* hash-chain successor */
};

/* Head of one cpu-view hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck; /* last time this bucket was checked/pruned */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
289
290static bool cpuview_init_head(struct cg_proc_stat_head **head)
291{
292 *head = malloc(sizeof(struct cg_proc_stat_head));
293 if (!(*head)) {
294 lxcfs_error("%s\n", strerror(errno));
295 return false;
296 }
297
951acc94 298 (*head)->lastcheck = time(NULL);
056adcef 299 (*head)->next = NULL;
2f49b662
JS
300
301 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
302 lxcfs_error("%s\n", "Failed to initialize list lock");
303 free(*head);
304 return false;
305 }
306
056adcef
JS
307 return true;
308}
309
310static bool init_cpuview()
311{
312 int i;
313
314 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
315 proc_stat_history[i] = NULL;
316
317 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
318 if (!cpuview_init_head(&proc_stat_history[i]))
319 goto err;
320 }
321
322 return true;
323
324err:
325 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
326 if (proc_stat_history[i]) {
327 free(proc_stat_history[i]);
328 proc_stat_history[i] = NULL;
329 }
330 }
331
332 return false;
333}
334
951acc94
JS
/* Release a cg_proc_stat node: destroy its mutex, then free the owned
 * strings/arrays and the node itself. */
static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}
343
056adcef
JS
344static void cpuview_free_head(struct cg_proc_stat_head *head)
345{
346 struct cg_proc_stat *node, *tmp;
347
348 if (head->next) {
349 node = head->next;
350
351 for (;;) {
352 tmp = node;
353 node = node->next;
951acc94 354 free_proc_stat_node(tmp);
056adcef
JS
355
356 if (!node)
357 break;
358 }
359 }
360
2f49b662 361 pthread_rwlock_destroy(&head->lock);
056adcef
JS
362 free(head);
363}
364
365static void free_cpuview()
366{
367 int i;
368
369 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
370 if (proc_stat_history[i])
371 cpuview_free_head(proc_stat_history[i]);
372 }
373}
374
237e200e
SH
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid.  Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved.  If not, clear the store
 *      entry and go back to a.  If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;      // inode number for /proc/$pid/ns/pid
	pid_t initpid;  // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next;  // hash-chain successor
	long int lastcheck; // last verification time; used by the pruner
};
396
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Lock @l; a locking failure is unrecoverable, so log and exit. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_lock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
412
29a73c2f
CB
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;

/* fd of the mount namespace the cgroup hierarchies live in; -1 until set. */
static int cgroup_mount_ns_fd = -1;
29a73c2f 433
237e200e
SH
/* Unlock @l; an unlocking failure is unrecoverable, so log and exit. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_unlock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

/* Serialize access to the pidns init-pid store. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
453
454/* Must be called under store_lock */
455static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
456{
457 struct stat initsb;
458 char fnam[100];
459
460 snprintf(fnam, 100, "/proc/%d", e->initpid);
461 if (stat(fnam, &initsb) < 0)
462 return false;
7dd6560a
CB
463
464 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
465 initsb.st_ctime, e->initpid);
466
237e200e
SH
467 if (e->ctime != initsb.st_ctime)
468 return false;
469 return true;
470}
471
472/* Must be called under store_lock */
473static void remove_initpid(struct pidns_init_store *e)
474{
475 struct pidns_init_store *tmp;
476 int h;
477
7dd6560a
CB
478 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
479
237e200e
SH
480 h = HASH(e->ino);
481 if (pidns_hash_table[h] == e) {
482 pidns_hash_table[h] = e->next;
483 free(e);
484 return;
485 }
486
487 tmp = pidns_hash_table[h];
488 while (tmp) {
489 if (tmp->next == e) {
490 tmp->next = e->next;
491 free(e);
492 return;
493 }
494 tmp = tmp->next;
495 }
496}
497
#define PURGE_SECS 5
/* Must be called under store_lock.
 * At most once every PURGE_SECS seconds, drop cached entries that have not
 * been verified within the last 2 * PURGE_SECS seconds. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	/* The first call only arms the timer; nothing can be stale yet. */
	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
540
541/* Must be called under store_lock */
542static void save_initpid(struct stat *sb, pid_t pid)
543{
544 struct pidns_init_store *e;
545 char fpath[100];
546 struct stat procsb;
547 int h;
548
7dd6560a
CB
549 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
550
237e200e
SH
551 snprintf(fpath, 100, "/proc/%d", pid);
552 if (stat(fpath, &procsb) < 0)
553 return;
554 do {
555 e = malloc(sizeof(*e));
556 } while (!e);
557 e->ino = sb->st_ino;
558 e->initpid = pid;
559 e->ctime = procsb.st_ctime;
560 h = HASH(e->ino);
561 e->next = pidns_hash_table[h];
562 e->lastcheck = time(NULL);
563 pidns_hash_table[h] = e;
564}
565
566/*
567 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
568 * entry for the inode number and creation time. Verify that the init pid
569 * is still valid. If not, remove it. Return the entry if valid, NULL
570 * otherwise.
571 * Must be called under store_lock
572 */
573static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
574{
575 int h = HASH(sb->st_ino);
576 struct pidns_init_store *e = pidns_hash_table[h];
577
578 while (e) {
579 if (e->ino == sb->st_ino) {
580 if (initpid_still_valid(e, sb)) {
581 e->lastcheck = time(NULL);
582 return e;
583 }
584 remove_initpid(e);
585 return NULL;
586 }
587 e = e->next;
588 }
589
590 return NULL;
591}
592
0f657ce3 593static int is_dir(const char *path, int fd)
237e200e
SH
594{
595 struct stat statbuf;
0f657ce3 596 int ret = fstatat(fd, path, &statbuf, fd);
237e200e
SH
597 if (ret == 0 && S_ISDIR(statbuf.st_mode))
598 return 1;
599 return 0;
600}
601
602static char *must_copy_string(const char *str)
603{
604 char *dup = NULL;
605 if (!str)
606 return NULL;
607 do {
608 dup = strdup(str);
609 } while (!dup);
610
611 return dup;
612}
613
/* Truncate @s in place, removing any run of trailing '\n' characters. */
static inline void drop_trailing_newlines(char *s)
{
	size_t l = strlen(s);

	while (l > 0 && s[l - 1] == '\n')
		s[--l] = '\0';
}
621
#define BATCH_SIZE 50
/* Grow @*mem (retrying until realloc succeeds) so it can hold @newlen bytes,
 * rounding capacity up to BATCH_SIZE-sized batches to amortize reallocs. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (*mem && newbatches <= oldbatches)
		return;

	for (;;) {
		char *tmp = realloc(*mem, newbatches * BATCH_SIZE);
		if (tmp) {
			*mem = tmp;
			return;
		}
	}
}
/* Append @line (length @linelen, including its terminating NUL at
 * line[linelen]) to the growable buffer @*contents, updating @*len. */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t total = *len + linelen;

	dorealloc(contents, *len, total + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = total;
}
643
/*
 * Read the whole file referred to by @fd into a freshly allocated string
 * with trailing newlines stripped. @from is the path, kept for symmetry
 * with callers (unused here). Consumes @fd in all cases.
 * Returns NULL on failure or when the file is empty.
 *
 * Fix: when fdopen() failed the original returned without closing @fd,
 * leaking the descriptor; callers treat the fd as consumed (on success it
 * is closed by fclose()).
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);

	free(line);
	return contents;
}
665
c9236032
HY
666static int preserve_ns(const int pid, const char *ns)
667{
668 int ret;
669/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
670#define __NS_PATH_LEN 50
671 char path[__NS_PATH_LEN];
672
673 /* This way we can use this function to also check whether namespaces
674 * are supported by the kernel by passing in the NULL or the empty
675 * string.
676 */
677 ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
678 !ns || strcmp(ns, "") == 0 ? "" : "/",
679 !ns || strcmp(ns, "") == 0 ? "" : ns);
680 if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
681 errno = EFBIG;
682 return -1;
683 }
684
685 return open(path, O_RDONLY | O_CLOEXEC);
686}
687
/**
 * in_same_namespace - Check whether two processes are in the same namespace.
 * @pid1 - PID of the first process.
 * @pid2 - PID of the second process.
 * @ns - Name of the namespace to check. Must correspond to one of the names
 *       for the namespaces as shown in /proc/<pid/ns/
 *
 * If the two processes are not in the same namespace returns an fd to the
 * namespace of the second process identified by @pid2. If the two processes are
 * in the same namespace returns -EINVAL, -1 if an error occurred.
 */
static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
{
	/* Both fds are auto-closed on return by the cleanup attribute;
	 * move_fd() below transfers ownership of ns_fd2 to the caller. */
	__do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
	int ret = -1;
	struct stat ns_st1, ns_st2;

	ns_fd1 = preserve_ns(pid1, ns);
	if (ns_fd1 < 0) {
		/* The kernel does not support this namespace. This is not an
		 * error.
		 */
		if (errno == ENOENT)
			return -EINVAL;

		return -1;
	}

	ns_fd2 = preserve_ns(pid2, ns);
	if (ns_fd2 < 0)
		return -1;

	ret = fstat(ns_fd1, &ns_st1);
	if (ret < 0)
		return -1;

	ret = fstat(ns_fd2, &ns_st2);
	if (ret < 0)
		return -1;

	/* processes are in the same namespace */
	if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
		return -EINVAL;

	/* processes are in different namespaces */
	return move_fd(ns_fd2);
}
735
/* True when @pid is 1 and shares lxcfs's own pid namespace. */
static bool is_shared_pidns(pid_t pid)
{
	if (pid != 1)
		return false;

	return in_same_namespace(pid, getpid(), "pid") == -EINVAL;
}
746
/*
 * Write @string to the file referred to by @fd (@fnam is only used in error
 * messages). Consumes @fd in all cases. Returns true on a complete write
 * that also flushed successfully.
 *
 * Fix: when fdopen() failed the original returned without closing @fd,
 * leaking the descriptor; callers (e.g. cgfs_set_value) treat the fd as
 * consumed — on every other path it is closed via fclose().
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
772
237e200e
SH
/* Ownership/mode metadata for one file inside a cgroup directory. */
struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

#define ALLOC_NUM 20
/*
 * Append hierarchy name @h to the global hierarchies array, growing the
 * array in ALLOC_NUM-sized batches. Exits the process when realloc fails.
 * @stridx is unused here. Always returns true.
 */
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}
796
797static void print_subsystems(void)
798{
799 int i;
800
a257a8ee 801 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 802 fprintf(stderr, "hierarchies:\n");
237e200e
SH
803 for (i = 0; i < num_hierarchies; i++) {
804 if (hierarchies[i])
b8defc3d
CB
805 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
806 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
807 }
808}
809
/* True when @needle matches one full comma-separated element of @haystack. */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t nlen = strlen(needle);

	for (;;) {
		const char *comma = *cur ? strchr(cur, ',') : NULL;

		if (!comma)
			break;
		if ((size_t)(comma - cur) == nlen &&
		    strncmp(needle, cur, nlen) == 0)
			return true;
		cur = comma + 1;
	}

	/* Last (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
828
829/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
830/* Return the mounted controller and store the corresponding open file descriptor
831 * referring to the controller mountpoint in the private lxcfs namespace in
832 * @cfd.
833 */
834static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
835{
836 int i;
837
838 for (i = 0; i < num_hierarchies; i++) {
839 if (!hierarchies[i])
840 continue;
5dd3e6fd
CB
841 if (strcmp(hierarchies[i], controller) == 0) {
842 *cfd = fd_hierarchies[i];
237e200e 843 return hierarchies[i];
5dd3e6fd
CB
844 }
845 if (in_comma_list(controller, hierarchies[i])) {
846 *cfd = fd_hierarchies[i];
237e200e 847 return hierarchies[i];
5dd3e6fd 848 }
237e200e
SH
849 }
850
851 return NULL;
852}
853
/*
 * Write @value to @file of @cgroup under mounted @controller.
 * Returns true on success.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() takes ownership of @fd (closed via fclose()). */
	return write_string(fnam, value, fd);
}
880
881// Chown all the files in the cgroup directory. We do this when we create
882// a cgroup on behalf of a user.
f23fe717 883static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e 884{
f23fe717 885 struct dirent *direntp;
237e200e
SH
886 char path[MAXPATHLEN];
887 size_t len;
888 DIR *d;
f23fe717 889 int fd1, ret;
237e200e
SH
890
891 len = strlen(dirname);
892 if (len >= MAXPATHLEN) {
b8defc3d 893 lxcfs_error("Pathname too long: %s\n", dirname);
237e200e
SH
894 return;
895 }
896
f23fe717
CB
897 fd1 = openat(fd, dirname, O_DIRECTORY);
898 if (fd1 < 0)
899 return;
900
901 d = fdopendir(fd1);
237e200e 902 if (!d) {
b8defc3d 903 lxcfs_error("Failed to open %s\n", dirname);
237e200e
SH
904 return;
905 }
906
f23fe717 907 while ((direntp = readdir(d))) {
237e200e
SH
908 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
909 continue;
910 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
911 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 912 lxcfs_error("Pathname too long under %s\n", dirname);
237e200e
SH
913 continue;
914 }
f23fe717 915 if (fchownat(fd, path, uid, gid, 0) < 0)
b8defc3d 916 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
237e200e
SH
917 }
918 closedir(d);
919}
920
/*
 * Create cgroup @cg under mounted @controller, owned by @uid:@gid.
 * For non-root owners the directory and the files inside it are chowned too.
 * Returns 0 on success, a negative errno value on failure.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}
951
/*
 * Recursively remove directory @dirname (a path relative to cgroup mount fd
 * @cfd). @fd is a directory fd used for reading entries; it is dup()ed
 * because fdopendir() takes ownership of the descriptor it is given.
 * Returns true when everything was removed.
 * NOTE(review): the recursive call passes the parent's @fd rather than
 * opening @pathname — verify against upstream whether subdirectory entries
 * are listed as intended.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		/* Paths are resolved against the cgroup mount fd, never "/". */
		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
1010
1011bool cgfs_remove(const char *controller, const char *cg)
1012{
b7672ded 1013 int fd, cfd;
237e200e 1014 size_t len;
f5a6d92e 1015 char *dirnam, *tmpc;
7213ec5c 1016 bool bret;
237e200e 1017
f5a6d92e 1018 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1019 if (!tmpc)
1020 return false;
f5a6d92e
CB
1021
1022 /* Make sure we pass a relative path to *at() family of functions.
1023 * . + /cg + \0
1024 */
b7672ded 1025 len = strlen(cg) + 2;
237e200e 1026 dirnam = alloca(len);
b7672ded
CB
1027 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
1028
1029 fd = openat(cfd, dirnam, O_DIRECTORY);
1030 if (fd < 0)
1031 return false;
1032
7213ec5c
CB
1033 bret = recursive_rmdir(dirnam, fd, cfd);
1034 close(fd);
1035 return bret;
237e200e
SH
1036}
1037
/* chmod @file under mounted @controller to @mode. Returns true on success. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *relpath, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* The *at() family wants a path relative to @cfd: "." + file + NUL. */
	len = strlen(file) + 2;
	relpath = alloca(len);
	snprintf(relpath, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) == 0;
}
1058
0f657ce3 1059static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e
SH
1060{
1061 size_t len;
1062 char *fname;
1063
1064 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
1065 fname = alloca(len);
1066 snprintf(fname, len, "%s/tasks", dirname);
0f657ce3 1067 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1068 return -errno;
1069 snprintf(fname, len, "%s/cgroup.procs", dirname);
0f657ce3 1070 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1071 return -errno;
1072 return 0;
1073}
1074
/*
 * Chown @file under mounted @controller to @uid:@gid; when @file is a
 * directory its tasks/cgroup.procs files are chowned as well.
 * Returns 0 on success, a negative errno value on failure.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}
1100
/* Open @cgroup's cgroup.procs file under mounted @controller for writing.
 * Returns a stdio stream (which owns the fd) or NULL on failure. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *relpath, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return NULL;

	/* Relative path for openat(): . + /cgroup + /cgroup.procs + \0 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	relpath = alloca(len);
	snprintf(relpath, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, relpath, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1124
f366da65
WB
1125static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1126 void ***list, size_t typesize,
1127 void* (*iterator)(const char*, const char*, const char*))
237e200e 1128{
4ea38a4c 1129 int cfd, fd, ret;
237e200e 1130 size_t len;
4ea38a4c 1131 char *cg, *tmpc;
237e200e 1132 char pathname[MAXPATHLEN];
f366da65 1133 size_t sz = 0, asz = 0;
4ea38a4c 1134 struct dirent *dirent;
237e200e 1135 DIR *dir;
237e200e 1136
4ea38a4c 1137 tmpc = find_mounted_controller(controller, &cfd);
f366da65 1138 *list = NULL;
237e200e 1139 if (!tmpc)
e97c834b 1140 return false;
237e200e 1141
f5a6d92e 1142 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
1143 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1144 cg = alloca(len);
1145 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1146 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 1147 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
1148 return false;
1149 }
237e200e 1150
4ea38a4c
CB
1151 fd = openat(cfd, cg, O_DIRECTORY);
1152 if (fd < 0)
1153 return false;
1154
1155 dir = fdopendir(fd);
237e200e
SH
1156 if (!dir)
1157 return false;
1158
4ea38a4c 1159 while ((dirent = readdir(dir))) {
237e200e 1160 struct stat mystat;
237e200e 1161
4ea38a4c
CB
1162 if (!strcmp(dirent->d_name, ".") ||
1163 !strcmp(dirent->d_name, ".."))
237e200e
SH
1164 continue;
1165
4ea38a4c
CB
1166 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1167 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 1168 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
1169 continue;
1170 }
1171
4ea38a4c 1172 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1173 if (ret) {
b8defc3d 1174 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1175 continue;
1176 }
f366da65
WB
1177 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1178 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1179 continue;
1180
1181 if (sz+2 >= asz) {
f366da65 1182 void **tmp;
237e200e
SH
1183 asz += BATCH_SIZE;
1184 do {
f366da65 1185 tmp = realloc(*list, asz * typesize);
237e200e
SH
1186 } while (!tmp);
1187 *list = tmp;
1188 }
4ea38a4c 1189 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1190 (*list)[sz+1] = NULL;
1191 sz++;
1192 }
1193 if (closedir(dir) < 0) {
b8defc3d 1194 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1195 return false;
1196 }
1197 return true;
1198}
1199
f366da65
WB
1200static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1201{
1202 char *dup;
1203 do {
1204 dup = strdup(dir_entry);
1205 } while (!dup);
1206 return dup;
1207}
1208
1209bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1210{
1211 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1212}
1213
237e200e
SH
1214void free_key(struct cgfs_files *k)
1215{
1216 if (!k)
1217 return;
1218 free(k->name);
1219 free(k);
1220}
1221
/* Free a NULL-terminated array of cgfs_files entries, then the array. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **it;

	if (!keys)
		return;

	for (it = keys; *it; it++)
		free_key(*it);
	free(keys);
}
1233
/* Read @controller:@cgroup/@file into *@value (allocated by slurp_file(),
 * owned by the caller). Returns true on success.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *relpath, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() family: . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, relpath, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(relpath, fd);
	return *value != NULL;
}
1260
951acc94
JS
1261bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1262{
1263 int ret, cfd;
1264 size_t len;
1265 char *fnam, *tmpc;
1266
1267 tmpc = find_mounted_controller(controller, &cfd);
1268 if (!tmpc)
1269 return false;
1270
1271 /* Make sure we pass a relative path to *at() family of functions.
1272 * . + /cgroup + / + file + \0
1273 */
1274 len = strlen(cgroup) + strlen(file) + 3;
1275 fnam = alloca(len);
1276 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1277 if (ret < 0 || (size_t)ret >= len)
1278 return false;
1279
1280 return (faccessat(cfd, fnam, F_OK, 0) == 0);
1281}
1282
237e200e
SH
1283struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1284{
4ea38a4c 1285 int ret, cfd;
237e200e 1286 size_t len;
f5a6d92e 1287 char *fnam, *tmpc;
237e200e
SH
1288 struct stat sb;
1289 struct cgfs_files *newkey;
237e200e 1290
f5a6d92e 1291 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1292 if (!tmpc)
1293 return false;
1294
1295 if (file && *file == '/')
1296 file++;
1297
06081b29 1298 if (file && strchr(file, '/'))
237e200e
SH
1299 return NULL;
1300
f5a6d92e
CB
1301 /* Make sure we pass a relative path to *at() family of functions.
1302 * . + /cgroup + / + file + \0
1303 */
4ea38a4c 1304 len = strlen(cgroup) + 3;
237e200e
SH
1305 if (file)
1306 len += strlen(file) + 1;
1307 fnam = alloca(len);
4ea38a4c
CB
1308 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1309 file ? "/" : "", file ? file : "");
237e200e 1310
4ea38a4c 1311 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1312 if (ret < 0)
1313 return NULL;
1314
1315 do {
1316 newkey = malloc(sizeof(struct cgfs_files));
1317 } while (!newkey);
1318 if (file)
1319 newkey->name = must_copy_string(file);
06081b29
CB
1320 else if (strrchr(cgroup, '/'))
1321 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1322 else
1323 newkey->name = must_copy_string(cgroup);
1324 newkey->uid = sb.st_uid;
1325 newkey->gid = sb.st_gid;
1326 newkey->mode = sb.st_mode;
1327
1328 return newkey;
1329}
1330
/* Iterator callback for cgfs_list_keys(): build a cgfs_files entry for
 * @dir_entry, logging (and returning NULL) when the key cannot be stat'd.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry;

	entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return entry;
}
1340
1341bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1342{
1343 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1344}
1345
/* Return true if @f is a child cgroup directory of @cgroup in @controller. */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() family: . + /cgroup + / + f + \0 */
	len = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0 || !S_ISDIR(sb.st_mode))
		return false;

	return true;
}
1373
1374#define SEND_CREDS_OK 0
1375#define SEND_CREDS_NOTSK 1
1376#define SEND_CREDS_FAIL 2
1377static bool recv_creds(int sock, struct ucred *cred, char *v);
1378static int wait_for_pid(pid_t pid);
1379static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1380static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1381
1382/*
b10bdd6c 1383 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1384 * over a unix sock so we can read the task's reaper's pid in our
1385 * namespace
b10bdd6c
FG
1386 *
1387 * Note: glibc's fork() does not respect pidns, which can lead to failed
1388 * assertions inside glibc (and thus failed forks) if the child's pid in
1389 * the pidns and the parent pid outside are identical. Using clone prevents
1390 * this issue.
237e200e
SH
1391 */
1392static void write_task_init_pid_exit(int sock, pid_t target)
1393{
237e200e
SH
1394 char fnam[100];
1395 pid_t pid;
237e200e 1396 int fd, ret;
b10bdd6c
FG
1397 size_t stack_size = sysconf(_SC_PAGESIZE);
1398 void *stack = alloca(stack_size);
237e200e
SH
1399
1400 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1401 if (ret < 0 || ret >= sizeof(fnam))
1402 _exit(1);
1403
1404 fd = open(fnam, O_RDONLY);
1405 if (fd < 0) {
1406 perror("write_task_init_pid_exit open of ns/pid");
1407 _exit(1);
1408 }
1409 if (setns(fd, 0)) {
1410 perror("write_task_init_pid_exit setns 1");
1411 close(fd);
1412 _exit(1);
1413 }
b10bdd6c 1414 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1415 if (pid < 0)
1416 _exit(1);
1417 if (pid != 0) {
1418 if (!wait_for_pid(pid))
1419 _exit(1);
1420 _exit(0);
1421 }
b10bdd6c
FG
1422}
1423
1424static int send_creds_clone_wrapper(void *arg) {
1425 struct ucred cred;
1426 char v;
1427 int sock = *(int *)arg;
237e200e
SH
1428
1429 /* we are the child */
1430 cred.uid = 0;
1431 cred.gid = 0;
1432 cred.pid = 1;
1433 v = '1';
1434 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1435 return 1;
1436 return 0;
237e200e
SH
1437}
1438
1439static pid_t get_init_pid_for_task(pid_t task)
1440{
1441 int sock[2];
1442 pid_t pid;
1443 pid_t ret = -1;
1444 char v = '0';
1445 struct ucred cred;
1446
1447 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1448 perror("socketpair");
1449 return -1;
1450 }
1451
1452 pid = fork();
1453 if (pid < 0)
1454 goto out;
1455 if (!pid) {
1456 close(sock[1]);
1457 write_task_init_pid_exit(sock[0], task);
1458 _exit(0);
1459 }
1460
1461 if (!recv_creds(sock[1], &cred, &v))
1462 goto out;
1463 ret = cred.pid;
1464
1465out:
1466 close(sock[0]);
1467 close(sock[1]);
1468 if (pid > 0)
1469 wait_for_pid(pid);
1470 return ret;
1471}
1472
71f17cd2 1473pid_t lookup_initpid_in_store(pid_t qpid)
237e200e
SH
1474{
1475 pid_t answer = 0;
1476 struct stat sb;
1477 struct pidns_init_store *e;
1478 char fnam[100];
1479
1480 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1481 store_lock();
1482 if (stat(fnam, &sb) < 0)
1483 goto out;
1484 e = lookup_verify_initpid(&sb);
1485 if (e) {
1486 answer = e->initpid;
1487 goto out;
1488 }
1489 answer = get_init_pid_for_task(qpid);
1490 if (answer > 0)
1491 save_initpid(&sb, answer);
1492
1493out:
1494 /* we prune at end in case we are returning
1495 * the value we were about to return */
1496 prune_initpid_store();
1497 store_unlock();
1498 return answer;
1499}
1500
/* Reap @pid, retrying on EINTR. Returns 0 only when the child exited
 * normally with status 0, else -1.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w == -1 && errno != EINTR)
			return -1;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;

	return 0;
}
1521
237e200e 1522/*
db1b32f6
SX
1523 * append the given formatted string to *src.
1524 * src: a pointer to a char* in which to append the formatted string.
237e200e
SH
1525 * sz: the number of characters printed so far, minus trailing \0.
1526 * asz: the allocated size so far
db1b32f6
SX
1527 * format: string format. See printf for details.
1528 * ...: varargs. See printf for details.
237e200e 1529 */
db1b32f6 1530static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
237e200e 1531{
db1b32f6
SX
1532 char tmp[BUF_RESERVE_SIZE];
1533 va_list args;
237e200e 1534
db1b32f6
SX
1535 va_start (args, format);
1536 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1537 va_end(args);
237e200e
SH
1538
1539 if (!*src || tmplen + *sz + 1 >= *asz) {
1540 char *tmp;
1541 do {
1542 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1543 } while (!tmp);
1544 *src = tmp;
1545 *asz += BUF_RESERVE_SIZE;
1546 }
bbfd0e33 1547 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1548 *sz += tmplen;
237e200e
SH
1549}
1550
db1b32f6
SX
1551/*
1552 * append pid to *src.
1553 * src: a pointer to a char* in which ot append the pid.
1554 * sz: the number of characters printed so far, minus trailing \0.
1555 * asz: the allocated size so far
1556 * pid: the pid to append
1557 */
/* Append "<pid>\n" to *src via must_strcat(); see that function's contract. */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	must_strcat(src, sz, asz, "%d\n", (int)pid);
}
1562
237e200e
SH
1563/*
1564 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1565 * valid in the caller's namespace, return the id mapped into
1566 * pid's namespace.
1567 * Returns the mapped id, or -1 on error.
1568 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	/* Re-scan the map file from the top on every call. */
	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		unsigned int ns_base,   /* base id for a range in the idfile's ns */
			     host_base, /* base id for a range in the caller's ns */
			     range;     /* number of ids in this range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    ns_base, host_base, range, line);
			return -1;
		}

		if (host_base <= in_id && host_base + range > in_id) {
			/*
			 * now since host_base <= in_id < host_base+range, and
			 * host_base+range and ns_base+range do not wrap around,
			 * we know that ns_base+(in_id-host_base) which must be
			 * less than ns_base+range must not wrap around
			 */
			return (in_id - host_base) + ns_base;
		}
	}

	/* no answer found */
	return -1;
}
1606
1607/*
1608 * for is_privileged_over,
1609 * specify whether we require the calling uid to be root in his
1610 * namespace
1611 */
1612#define NS_ROOT_REQD true
1613#define NS_ROOT_OPT false
1614
1615#define PROCLEN 100
1616
1617static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1618{
1619 char fpath[PROCLEN];
1620 int ret;
1621 bool answer = false;
1622 uid_t nsuid;
1623
1624 if (victim == -1 || uid == -1)
1625 return false;
1626
1627 /*
1628 * If the request is one not requiring root in the namespace,
1629 * then having the same uid suffices. (i.e. uid 1000 has write
1630 * access to files owned by uid 1000
1631 */
1632 if (!req_ns_root && uid == victim)
1633 return true;
1634
1635 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1636 if (ret < 0 || ret >= PROCLEN)
1637 return false;
1638 FILE *f = fopen(fpath, "r");
1639 if (!f)
1640 return false;
1641
1642 /* if caller's not root in his namespace, reject */
1643 nsuid = convert_id_to_ns(f, uid);
1644 if (nsuid)
1645 goto out;
1646
1647 /*
1648 * If victim is not mapped into caller's ns, reject.
1649 * XXX I'm not sure this check is needed given that fuse
1650 * will be sending requests where the vfs has converted
1651 */
1652 nsuid = convert_id_to_ns(f, victim);
1653 if (nsuid == -1)
1654 goto out;
1655
1656 answer = true;
1657
1658out:
1659 fclose(f);
1660 return answer;
1661}
1662
/* Return true if the "other" permission bits of @fmode grant the access
 * requested by @req_mode's O_ACCMODE component; callers pre-shift @fmode
 * to select the owner/group/other triplet.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		needed = S_IROTH;
		break;
	case O_WRONLY:
		needed = S_IWOTH;
		break;
	case O_RDWR:
		needed = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}

	return (fmode & needed) == needed;
}
1682
1683
1684/*
1685 * taskcg is a/b/c
1686 * querycg is /a/b/c/d/e
1687 * we return 'd'
1688 */
/*
 * taskcg is a/b/c, querycg is /a/b/c/d/e: return "d", the next path
 * component of taskcg below querycg. Heap-allocated; caller frees.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		next = strdup(taskcg + 1);
	else
		next = strdup(taskcg + strlen(querycg) + 1);
	if (!next)
		return NULL;

	/* Cut at the first '/' so exactly one component is returned. */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
1709
/* Chop a single trailing newline off @x in place, if present. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1716
71f17cd2 1717char *get_pid_cgroup(pid_t pid, const char *contrl)
237e200e 1718{
5dd3e6fd 1719 int cfd;
237e200e
SH
1720 char fnam[PROCLEN];
1721 FILE *f;
1722 char *answer = NULL;
1723 char *line = NULL;
1724 size_t len = 0;
1725 int ret;
5dd3e6fd 1726 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1727 if (!h)
1728 return NULL;
1729
1730 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1731 if (ret < 0 || ret >= PROCLEN)
1732 return NULL;
1733 if (!(f = fopen(fnam, "r")))
1734 return NULL;
1735
1736 while (getline(&line, &len, f) != -1) {
1737 char *c1, *c2;
1738 if (!line[0])
1739 continue;
1740 c1 = strchr(line, ':');
1741 if (!c1)
1742 goto out;
1743 c1++;
1744 c2 = strchr(c1, ':');
1745 if (!c2)
1746 goto out;
1747 *c2 = '\0';
1748 if (strcmp(c1, h) != 0)
1749 continue;
1750 c2++;
1751 stripnewline(c2);
1752 do {
1753 answer = strdup(c2);
1754 } while (!answer);
1755 break;
1756 }
1757
1758out:
1759 fclose(f);
1760 free(line);
1761 return answer;
1762}
1763
1764/*
1765 * check whether a fuse context may access a cgroup dir or file
1766 *
1767 * If file is not null, it is a cgroup file to check under cg.
1768 * If file is null, then we are checking perms on cg itself.
1769 *
1770 * For files we can check the mode of the list_keys result.
1771 * For cgroups, we must make assumptions based on the files under the
1772 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1773 * yet.
1774 */
1775static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1776{
1777 struct cgfs_files *k = NULL;
1778 bool ret = false;
1779
1780 k = cgfs_get_key(contrl, cg, file);
1781 if (!k)
1782 return false;
1783
1784 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1785 if (perms_include(k->mode >> 6, mode)) {
1786 ret = true;
1787 goto out;
1788 }
1789 }
1790 if (fc->gid == k->gid) {
1791 if (perms_include(k->mode >> 3, mode)) {
1792 ret = true;
1793 goto out;
1794 }
1795 }
1796 ret = perms_include(k->mode, mode);
1797
1798out:
1799 free_key(k);
1800 return ret;
1801}
1802
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" (as added by systemd) from cgroup path
 * @cg in place; "/init.scope" alone becomes "/".
 */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg), suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + cg_len - suffix_len;
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		*(tail + 1) = '\0';	/* keep the leading '/' */
	else
		*tail = '\0';
}
1820
1821/*
1822 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1823 * If pid is in /a, he may act on /a/b, but not on /b.
1824 * if the answer is false and nextcg is not NULL, then *nextcg will point
1825 * to a string containing the next cgroup directory under cg, which must be
1826 * freed by the caller.
1827 */
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, *nextcg is set to the
 * next cgroup directory under cg (caller frees).
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *taskcg, *linecmp;

	taskcg = get_pid_cgroup(pid, contrl);
	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise a cgroup without a leading '/'; skip our leading '/'
	 * in the latter case so the prefix comparison lines up.
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = taskcg;
	else
		linecmp = taskcg + 1;

	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg)
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		goto out;
	}

	answer = true;

out:
	free(taskcg);
	return answer;
}
1863
1864/*
1865 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1866 */
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *whole, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	whole = get_pid_cgroup(pid, contrl);
	if (!whole)
		return false;
	prune_init_slice(whole);

	task_cg = whole + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}

	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}

	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}

	/* looking up a child dir (equal lengths already failed strcmp above) */
	if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
		answer = true;

out:
	free(whole);
	return answer;
}
1914
1915/*
1916 * given /cgroup/freezer/a/b, return "freezer".
1917 * the returned char* should NOT be freed.
1918 */
1919static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1920{
1921 const char *p1;
1922 char *contr, *slash;
1923
99142521 1924 if (strlen(path) < 9) {
e254948f 1925 errno = EACCES;
237e200e 1926 return NULL;
99142521
CB
1927 }
1928 if (*(path + 7) != '/') {
1929 errno = EINVAL;
237e200e 1930 return NULL;
99142521 1931 }
3adc421c 1932 p1 = path + 8;
237e200e 1933 contr = strdupa(p1);
99142521
CB
1934 if (!contr) {
1935 errno = ENOMEM;
237e200e 1936 return NULL;
99142521 1937 }
237e200e
SH
1938 slash = strstr(contr, "/");
1939 if (slash)
1940 *slash = '\0';
1941
1942 int i;
3adc421c 1943 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1944 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1945 return hierarchies[i];
1946 }
99142521 1947 errno = ENOENT;
237e200e
SH
1948 return NULL;
1949}
1950
1951/*
1952 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1953 * Note that the returned value may include files (keynames) etc
1954 */
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 * Returns NULL with errno set (EACCES/EINVAL) on a malformed path.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strstr(path + 8, "/");
	if (!slash) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1971
1972/*
1973 * split the last path element from the path in @cg.
1974 * @dir is newly allocated and should be freed, @last not
1975*/
1976static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1977{
1978 char *p;
1979
1980 do {
1981 *dir = strdup(cg);
1982 } while (!*dir);
1983 *last = strrchr(cg, '/');
1984 if (!*last) {
1985 *last = NULL;
1986 return;
1987 }
1988 p = strrchr(*dir, '/');
1989 *p = '\0';
1990}
1991
1992/*
1993 * FUSE ops for /cgroup
1994 */
1995
/*
 * FUSE getattr for /cgroup paths.
 *
 * Resolves @path to (controller, cgroup, file), fills @sb with synthetic
 * attributes derived from the backing cgroup files, and hides entries the
 * caller's pid namespace may not see. Returns 0 or a negative errno.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;

	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Synthetic entries: root-owned, all timestamps "now", no size. */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* path1/path2 = (parent dir, final component) of the cgroup path. */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Evaluate visibility from the caller's pidns init, not the task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: try it as a control file of path1. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2105
/*
 * FUSE opendir for /cgroup paths: validate that the caller may see and
 * read the directory, then stash a file_info cookie in fi->fh for use by
 * cg_readdir()/cg_releasedir(). Returns 0 or a negative errno.
 */
int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		/* Top-level dir: readdir will list the controllers. */
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	/* Evaluate visibility from the caller's pidns init, not the task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}
2156
/*
 * FUSE readdir: emit "." and "..", then either the list of controllers
 * (for the top-level /cgroup dir) or the keys and child cgroups of the
 * opened directory, filtered to what the caller's pid namespace may see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	/* fi->fh must be the cookie installed by cg_opendir(). */
	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	/* Evaluate visibility from the caller's pidns init, not the task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller is above this cgroup: show only the next component
		 * of its own cgroup path, nothing else. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Control files (keys) of this cgroup. */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2240
71f17cd2 2241void do_release_file_info(struct fuse_file_info *fi)
237e200e 2242{
43215927
SH
2243 struct file_info *f = (struct file_info *)fi->fh;
2244
237e200e
SH
2245 if (!f)
2246 return;
43215927
SH
2247
2248 fi->fh = 0;
2249
237e200e 2250 free(f->controller);
43215927 2251 f->controller = NULL;
237e200e 2252 free(f->cgroup);
43215927 2253 f->cgroup = NULL;
237e200e 2254 free(f->file);
43215927 2255 f->file = NULL;
237e200e 2256 free(f->buf);
43215927 2257 f->buf = NULL;
237e200e 2258 free(f);
bbb508dd 2259 f = NULL;
237e200e
SH
2260}
2261
/* FUSE releasedir: drop the directory cookie installed by cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2267
/*
 * FUSE open for /cgroup control files: verify the key exists, that the
 * caller's pid namespace may see its directory and that fi->flags are
 * permitted, then stash a file_info cookie in fi->fh for read/write/release.
 * Returns 0 or a negative errno.
 */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1/path2 = (containing cgroup dir, key filename). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Only checking existence here; ownership is re-read on access. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	/* Evaluate visibility from the caller's pidns init, not the task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}
2335
bddbb106
SH
2336int cg_access(const char *path, int mode)
2337{
6f0f6b83 2338 int ret;
bddbb106 2339 const char *cgroup;
6f0f6b83
CB
2340 char *path1, *path2, *controller;
2341 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2342 struct cgfs_files *k = NULL;
2343 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2344
9873c5e8 2345 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2346 return 0;
bddbb106
SH
2347
2348 if (!fc)
2349 return -EIO;
2350
2351 controller = pick_controller_from_path(fc, path);
2352 if (!controller)
2f7036d0 2353 return -errno;
bddbb106 2354 cgroup = find_cgroup_in_path(path);
575316c4
SH
2355 if (!cgroup) {
2356 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2357 if ((mode & W_OK) == 0)
2358 return 0;
2359 return -EACCES;
575316c4 2360 }
bddbb106
SH
2361
2362 get_cgdir_and_path(cgroup, &cgdir, &last);
2363 if (!last) {
2364 path1 = "/";
2365 path2 = cgdir;
2366 } else {
2367 path1 = cgdir;
2368 path2 = last;
2369 }
2370
2371 k = cgfs_get_key(controller, path1, path2);
2372 if (!k) {
3f441bc7
SH
2373 if ((mode & W_OK) == 0)
2374 ret = 0;
2375 else
2376 ret = -EACCES;
bddbb106
SH
2377 goto out;
2378 }
2379 free_key(k);
2380
2381 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 2382 if (initpid <= 1 || is_shared_pidns(initpid))
bddbb106
SH
2383 initpid = fc->pid;
2384 if (!caller_may_see_dir(initpid, controller, path1)) {
2385 ret = -ENOENT;
2386 goto out;
2387 }
2388 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2389 ret = -EACCES;
2390 goto out;
2391 }
2392
2393 ret = 0;
2394
2395out:
2396 free(cgdir);
2397 return ret;
2398}
2399
237e200e
SH
/*
 * cg_release - FUSE release handler for cgroup files. Frees the
 * struct file_info attached by cg_open(); always succeeds.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2405
2406#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2407
2408static bool wait_for_sock(int sock, int timeout)
2409{
2410 struct epoll_event ev;
2411 int epfd, ret, now, starttime, deltatime, saved_errno;
2412
2413 if ((starttime = time(NULL)) < 0)
2414 return false;
2415
2416 if ((epfd = epoll_create(1)) < 0) {
b8defc3d 2417 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
237e200e
SH
2418 return false;
2419 }
2420
2421 ev.events = POLLIN_SET;
2422 ev.data.fd = sock;
2423 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
b8defc3d 2424 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
237e200e
SH
2425 close(epfd);
2426 return false;
2427 }
2428
2429again:
2430 if ((now = time(NULL)) < 0) {
2431 close(epfd);
2432 return false;
2433 }
2434
2435 deltatime = (starttime + timeout) - now;
2436 if (deltatime < 0) { // timeout
2437 errno = 0;
2438 close(epfd);
2439 return false;
2440 }
2441 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2442 if (ret < 0 && errno == EINTR)
2443 goto again;
2444 saved_errno = errno;
2445 close(epfd);
2446
2447 if (ret <= 0) {
2448 errno = saved_errno;
2449 return false;
2450 }
2451 return true;
2452}
2453
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, waiting at
 * most two seconds for data. Returns the recv(2) result, or -1 if the
 * socket never became readable.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2460
2461static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2462{
2463 struct msghdr msg = { 0 };
2464 struct iovec iov;
2465 struct cmsghdr *cmsg;
2466 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2467 char buf[1];
2468 buf[0] = 'p';
2469
2470 if (pingfirst) {
2471 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2472 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2473 return SEND_CREDS_FAIL;
2474 }
2475 }
2476
2477 msg.msg_control = cmsgbuf;
2478 msg.msg_controllen = sizeof(cmsgbuf);
2479
2480 cmsg = CMSG_FIRSTHDR(&msg);
2481 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2482 cmsg->cmsg_level = SOL_SOCKET;
2483 cmsg->cmsg_type = SCM_CREDENTIALS;
2484 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2485
2486 msg.msg_name = NULL;
2487 msg.msg_namelen = 0;
2488
2489 buf[0] = v;
2490 iov.iov_base = buf;
2491 iov.iov_len = sizeof(buf);
2492 msg.msg_iov = &iov;
2493 msg.msg_iovlen = 1;
2494
2495 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2496 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2497 if (errno == 3)
2498 return SEND_CREDS_NOTSK;
2499 return SEND_CREDS_FAIL;
2500 }
2501
2502 return SEND_CREDS_OK;
2503}
2504
/*
 * recv_creds - peer of send_creds(): receive one SCM_CREDENTIALS message
 * from @sock into @cred, plus its single data byte into @v.
 *
 * Enables SO_PASSCRED and writes a one-byte ping ('1') first so that a
 * sender using send_creds(..., pingfirst=true) can pair up with us.
 * Waits at most two seconds for the message.
 *
 * Returns true when a message arrived - even one without a credential
 * payload, in which case @cred keeps the -1 sentinels - false on error.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	/* defaults in case no credentials are attached to the message */
	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* only accept a control message that is exactly one struct ucred */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2562
35174b0f
FG
/* Argument bundle passed to pid_ns_clone_wrapper() through clone(2). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe used to ack the parent before doing work */
	int sock;	/* socket handed to the wrapped translation fn */
	pid_t tpid;	/* pid whose pid namespace is being targeted */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2569
2570/*
2571 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2572 * with clone(). This simply writes '1' as ACK back to the parent
2573 * before calling the actual wrapped function.
2574 */
2575static int pid_ns_clone_wrapper(void *arg) {
2576 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2577 char b = '1';
2578
2579 close(args->cpipe[0]);
b8defc3d
CB
2580 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2581 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2582 close(args->cpipe[1]);
2583 return args->wrapped(args->sock, args->tpid);
2584}
237e200e
SH
2585
2586/*
2587 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2588 * int value back over the socket. This shifts the pid from the
2589 * sender's pidns into tpid's pidns.
2590 */
35174b0f 2591static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2592{
2593 char v = '0';
2594 struct ucred cred;
2595
2596 while (recv_creds(sock, &cred, &v)) {
2597 if (v == '1')
35174b0f 2598 return 0;
237e200e 2599 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2600 return 1;
237e200e 2601 }
35174b0f 2602 return 0;
237e200e
SH
2603}
2604
35174b0f 2605
237e200e
SH
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child of do_read_pids() and never returns: every exit
 * path is _exit().
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	/* enter @tpid's pid namespace; children cloned below land in it */
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* pipe over which the grandchild acks that it has started */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* one page of stack for the clone child; freed with our frame */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* stack grows down: pass the high end of the allocation */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2662
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * Reads @contrl:@cg:@file, translates every pid in it into @tpid's pid
 * namespace via a forked helper, and returns the rewritten list in *@d
 * (caller frees). Returns true on success.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* raw pid list as seen from the host's pid namespace */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* ship the pid as a credential so the kernel translates it */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next;	/* pid vanished; skip it */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* v == '1' tells the helper to shut down */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2750
/*
 * cg_read - FUSE read handler for cgroup files.
 *
 * tasks/cgroup.procs get their pids translated into the reader's pid
 * namespace; all other files are passed through. Only offset 0 is
 * served (whole-value reads); a trailing newline is appended when the
 * data lacks one and there is room. Returns bytes read or -errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* existence check only; the key itself is not needed */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* make partial values still end in a newline when room permits */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2817
35174b0f 2818static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2819{
2820 pid_t vpid;
2821 struct ucred cred;
2822 char v;
2823 int ret;
2824
2825 cred.uid = 0;
2826 cred.gid = 0;
2827 while (1) {
2828 if (!wait_for_sock(sock, 2)) {
b8defc3d 2829 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2830 return 1;
237e200e
SH
2831 }
2832 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2833 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2834 return 1;
237e200e
SH
2835 }
2836 if (vpid == -1) // done
2837 break;
2838 v = '0';
2839 cred.pid = vpid;
2840 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2841 v = '1';
2842 cred.pid = getpid();
2843 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2844 return 1;
237e200e
SH
2845 }
2846 }
35174b0f 2847 return 0;
237e200e
SH
2848}
2849
/*
 * pid_from_ns_wrapper - mirror of pid_to_ns_wrapper: setns into @tpid's
 * pid namespace, then clone a child (the first process truly inside the
 * target ns) that runs pid_from_ns() on @sock. Runs in a forked child
 * of do_write_pids() and never returns: every exit path is _exit().
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* pipe over which the clone child acks that it has started */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* one page of clone stack; stack grows down so pass the high end */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2895
/*
 * hostuid_to_ns - map host @uid into @pid's user namespace.
 *
 * On success stores the mapped uid in *@answer and returns true;
 * returns false when /proc/<pid>/uid_map cannot be read or the uid is
 * unmapped. (The old comment claimed a -1 return; the function has
 * always reported failure through its bool result.)
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];

	/* snprintf: bound the write rather than trusting sprintf */
	snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if ((f = fopen(line, "r")) == NULL) {
		return false;
	}

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	if (*answer == -1)
		return false;
	return true;
}
2917
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left at (uid_t/gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;
	/* snprintf: bound the write rather than trusting sprintf */
	snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	while (fgets(line, 400, f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			/* first field after "Uid:" is the real uid */
			if (sscanf(line+4, "%u", &u) != 1) {
				/* %d: pid_t is signed, the old %u was a mismatch */
				lxcfs_error("bad uid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2956
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 * . they are the same task
 * . they are ownedy by the same uid
 * . @r is root on the host, or
 * . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, tmpuid;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0 &&
	       hostuid_to_ns(v_uid, r, &tmpuid);
}
2982
/*
 * do_write_pids - write the pids in @buf (caller's pid namespace) into
 * @contrl:@cg's pids file, translating each into the host view first.
 *
 * A forked helper setns's into @tpid's pidns and bounces each pid back
 * as SCM_CREDENTIALS so the kernel performs the translation; each
 * translated pid is permission-checked with may_move_pid() before being
 * written. Returns true only if every pid was moved and the final
 * fclose() (which flushes the writes) succeeded.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
			const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		/* drop our copy of the stream; pid_from_ns_wrapper _exits */
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* v == '0': cred.pid is the translated pid */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose flushes; a failed flush means the move failed */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
3061
3062int cg_write(const char *path, const char *buf, size_t size, off_t offset,
3063 struct fuse_file_info *fi)
3064{
3065 struct fuse_context *fc = fuse_get_context();
3066 char *localbuf = NULL;
3067 struct cgfs_files *k = NULL;
3068 struct file_info *f = (struct file_info *)fi->fh;
3069 bool r;
3070
3071 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 3072 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
3073 return -EIO;
3074 }
3075
3076 if (offset)
3077 return 0;
3078
3079 if (!fc)
3080 return -EIO;
3081
3082 localbuf = alloca(size+1);
3083 localbuf[size] = '\0';
3084 memcpy(localbuf, buf, size);
3085
3086 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3087 size = -EINVAL;
3088 goto out;
3089 }
3090
3091 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3092 size = -EACCES;
3093 goto out;
3094 }
3095
3096 if (strcmp(f->file, "tasks") == 0 ||
3097 strcmp(f->file, "/tasks") == 0 ||
3098 strcmp(f->file, "/cgroup.procs") == 0 ||
3099 strcmp(f->file, "cgroup.procs") == 0)
3100 // special case - we have to translate the pids
3101 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3102 else
3103 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3104
3105 if (!r)
3106 size = -EINVAL;
3107
3108out:
3109 free_key(k);
3110 return size;
3111}
3112
/*
 * cg_chown - FUSE chown handler for the cgroup hierarchy.
 *
 * The tree root and bare controller directories may never change owner
 * (-EPERM). For real entries the caller must be privileged over the
 * current owner (root in a userns covering it). Returns 0 or -errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3178
3179int cg_chmod(const char *path, mode_t mode)
3180{
3181 struct fuse_context *fc = fuse_get_context();
3182 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3183 struct cgfs_files *k = NULL;
3184 const char *cgroup;
3185 int ret;
3186
3187 if (!fc)
3188 return -EIO;
3189
3190 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 3191 return -EPERM;
237e200e
SH
3192
3193 controller = pick_controller_from_path(fc, path);
3194 if (!controller)
bc70ba9b
CB
3195 return errno == ENOENT ? -EPERM : -errno;
3196
237e200e
SH
3197 cgroup = find_cgroup_in_path(path);
3198 if (!cgroup)
3199 /* this is just /cgroup/controller */
bc70ba9b 3200 return -EPERM;
237e200e
SH
3201
3202 get_cgdir_and_path(cgroup, &cgdir, &last);
3203
3204 if (!last) {
3205 path1 = "/";
3206 path2 = cgdir;
3207 } else {
3208 path1 = cgdir;
3209 path2 = last;
3210 }
3211
3212 if (is_child_cgroup(controller, path1, path2)) {
3213 // get uid, gid, from '/tasks' file and make up a mode
3214 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3215 k = cgfs_get_key(controller, cgroup, "tasks");
3216
3217 } else
3218 k = cgfs_get_key(controller, path1, path2);
3219
3220 if (!k) {
3221 ret = -EINVAL;
3222 goto out;
3223 }
3224
3225 /*
3226 * This being a fuse request, the uid and gid must be valid
3227 * in the caller's namespace. So we can just check to make
3228 * sure that the caller is root in his uid, and privileged
3229 * over the file's current owner.
3230 */
3231 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3232 ret = -EPERM;
3233 goto out;
3234 }
3235
3236 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3237 ret = -EINVAL;
3238 goto out;
3239 }
3240
3241 ret = 0;
3242out:
3243 free_key(k);
3244 free(cgdir);
3245 return ret;
3246}
3247
/*
 * cg_mkdir - FUSE mkdir handler: create a new child cgroup.
 *
 * The caller must be acting inside (an ancestor of) the new cgroup's
 * parent as seen from its init's pid namespace, and must have write
 * access to the parent. Returns 0 or -errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory of the cgroup being created */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* distinguish "already exists" from plain permission denial */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3301
/*
 * cg_rmdir - FUSE rmdir handler: remove a child cgroup.
 *
 * Deleting the tree root, a controller directory, or anything at the
 * caller's own top level is refused with -EPERM; otherwise the caller
 * must be in an ancestor cgroup and have write access to the parent.
 * Returns 0 or -errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* the caller's own cgroup (or below) counts as busy */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3362
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3369
c6095b08
SH
/*
 * parse_memstat - scan a memory.stat blob and extract the total_*
 * counters the meminfo emulation needs. Values in memory.stat are in
 * bytes; each output is converted to KiB. Fields absent from @memstat
 * leave their output untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	/* key prefix -> output slot; lengths match the literal keys */
	struct {
		const char *key;
		size_t len;
		unsigned long *out;
	} fields[] = {
		{ "total_cache",         11, cached },
		{ "total_active_anon",   17, active_anon },
		{ "total_inactive_anon", 19, inactive_anon },
		{ "total_active_file",   17, active_file },
		{ "total_inactive_file", 19, inactive_file },
		{ "total_unevictable",   17, unevictable },
		{ "total_shmem",         11, shmem },
	};
	size_t i;
	char *eol;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (strncmp(memstat, fields[i].key, fields[i].len) == 0) {
				sscanf(memstat + fields[i].len, "%lu", fields[i].out);
				*fields[i].out /= 1024;
				break;
			}
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3406
/*
 * get_blkio_io_value - find the "major:minor iotype" line in a blkio
 * stat blob @str and store its value in *@v; *@v is 0 when the device
 * or iotype is not present.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t len;
	char *eol;

	memset(key, 0, sizeof(key));
	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	len = strlen(key);

	*v = 0;
	for (; *str; str = eol + 1) {
		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3429
71f17cd2 3430int read_file(const char *path, char *buf, size_t size, struct file_info *d)
237e200e
SH
3431{
3432 size_t linelen = 0, total_len = 0, rv = 0;
3433 char *line = NULL;
3434 char *cache = d->buf;
3435 size_t cache_size = d->buflen;
3436 FILE *f = fopen(path, "r");
3437 if (!f)
3438 return 0;
3439
3440 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3441 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3442 if (l < 0) {
3443 perror("Error writing to cache");
3444 rv = 0;
3445 goto err;
3446 }
3447 if (l >= cache_size) {
b8defc3d 3448 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3449 rv = 0;
3450 goto err;
3451 }
3452 cache += l;
3453 cache_size -= l;
3454 total_len += l;
3455 }
3456
3457 d->size = total_len;
a262ddb7
CB
3458 if (total_len > size)
3459 total_len = size;
237e200e
SH
3460
3461 /* read from off 0 */
3462 memcpy(buf, d->buf, total_len);
3463 rv = total_len;
3464 err:
3465 fclose(f);
3466 free(line);
030d022c
HY
3467 if (d->size > rv)
3468 d->cached = d->size - rv;
237e200e
SH
3469 return rv;
3470}
3471
3472/*
3473 * FUSE ops for /proc
3474 */
3475
018246ff 3476static unsigned long get_memlimit(const char *cgroup, const char *file)
237e200e
SH
3477{
3478 char *memlimit_str = NULL;
3479 unsigned long memlimit = -1;
3480
018246ff 3481 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
237e200e
SH
3482 memlimit = strtoul(memlimit_str, NULL, 10);
3483
3484 free(memlimit_str);
3485
3486 return memlimit;
3487}
3488
/*
 * Return the tightest (smallest) memory limit among `cgroup` and all of
 * its ancestors, reading `file` (e.g. memory.limit_in_bytes) at each
 * level of the hierarchy.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* strdupa allocates on the stack (auto-freed); dirname() below
	 * modifies this copy in place as we walk up to "/". */
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		/* -1 (ULONG_MAX) marks a level that could not be read. */
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	};

	return retlimit;
}
3505
/*
 * Virtualize /proc/meminfo for a container: read the host file and
 * rewrite memory-related lines from the reader's memory cgroup limits
 * and usage. Falls back to the raw host file when no memory cgroup can
 * be resolved for the reader.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder of the cached answer. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the reader's init pid so we pick up its memory cgroup. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		/* Convert bytes to kB to match /proc/meminfo units. */
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	/* Rewrite each host line with cgroup-derived values where needed;
	 * everything not matched below is passed through unchanged. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			/* Never report more memory than the host has. */
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
			/* Swap reporting disabled via mount option. */
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts && opts->swap_off == false) {
			unsigned long swaptotal = memswlimit,
				swapusage = memusage > memswusage ? 0 : memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3697
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed
 */
char *get_cpuset(const char *cg)
{
	char *value = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &value))
		return NULL;

	return value;
}
3710
3711bool cpu_in_cpuset(int cpu, const char *cpuset);
3712
/*
 * True when `line` is a "processor : N" entry whose CPU N is a member
 * of `cpuset`; false for any other line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_id;

	if (sscanf(line, "processor : %d", &cpu_id) == 1)
		return cpu_in_cpuset(cpu_id, cpuset);

	return false;
}
3721
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
 * depending on `param`. Parameter value is returned through `value`.
 * Returns false when the file cannot be read or parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
	char *str = NULL;

	/* snprintf instead of sprintf: guard the fixed-size buffer. */
	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64 instead of "%ld": the latter mismatches int64_t on
	 * 32-bit platforms, which is undefined behavior. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3747
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;
	char *cpuset = NULL;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	/* get_cpuset() returns an allocated string; only the CPU count is
	 * needed, so free it right away (previously leaked on every call). */
	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
	free(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0){
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	/* Never report more CPUs than the host has online. */
	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	/* use min value in cpu quota and cpuset */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}
3795
/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	int online_cpus;
	double count;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Never report more CPUs than the host has online. */
	online_cpus = get_nprocs();
	if (count > online_cpus)
		count = online_cpus;

	return count;
}
3824
/*
 * Determine whether CPU views should be used or not.
 * Requires both the cpu and cpuacct controllers to be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3843
/*
 * check whether this is a '^processor" line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int unused;

	/* sscanf returns 1 only when a CPU number was actually parsed. */
	return sscanf(line, "processor : %d", &unused) == 1;
}
3855
/*
 * Virtualize /proc/cpuinfo: emit only processor entries that belong to
 * the reader's cpuset, renumbering them from 0, and cap the count at
 * the CPU quota when cpu views are enabled. Contains a special path
 * for s390x, whose cpuinfo layout differs from other architectures.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder of the cached answer. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			/* s390x announces itself on the very first line. */
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's processor count; it is recomputed below. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU limit is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			/* Renumber the processor but keep the rest of the line. */
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* On s390x, rebuild the buffer so the synthesized vendor_id and
	 * processor-count header precede the body collected above. */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4031
/*
 * Return the starttime field of the reader's registered init process:
 * clock ticks since boot, field (22) of /proc/<pid>/stat.
 * On failure returns 0 with errno set to EINVAL; on success errno is 0
 * (0 is also a legal starttime, hence the errno protocol).
 */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument suppression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 */
	ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
		        "%*s "      /* (2)  comm        %s   */
		        "%*c "      /* (3)  state       %c   */
		        "%*d "      /* (4)  ppid        %d   */
		        "%*d "      /* (5)  pgrp        %d   */
		        "%*d "      /* (6)  session     %d   */
		        "%*d "      /* (7)  tty_nr      %d   */
		        "%*d "      /* (8)  tpgid       %d   */
		        "%*u "      /* (9)  flags       %u   */
		        "%*u "      /* (10) minflt      %lu  */
		        "%*u "      /* (11) cminflt     %lu  */
		        "%*u "      /* (12) majflt      %lu  */
		        "%*u "      /* (13) cmajflt     %lu  */
		        "%*u "      /* (14) utime       %lu  */
		        "%*u "      /* (15) stime       %lu  */
		        "%*d "      /* (16) cutime      %ld  */
		        "%*d "      /* (17) cstime      %ld  */
		        "%*d "      /* (18) priority    %ld  */
		        "%*d "      /* (19) nice        %ld  */
		        "%*d "      /* (20) num_threads %ld  */
		        "%*d "      /* (21) itrealvalue %ld  */
		        "%" PRIu64, /* (22) starttime   %llu */
		     &starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	errno = 0;
	return starttime;
}
4110
1c4b4e38 4111static double get_reaper_start_time_in_sec(pid_t pid)
0ecddf02 4112{
1c4b4e38
CB
4113 uint64_t clockticks, ticks_per_sec;
4114 int64_t ret;
4115 double res = 0;
0ecddf02
CB
4116
4117 clockticks = get_reaper_start_time(pid);
4118 if (clockticks == 0 && errno == EINVAL) {
4119 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
4120 return 0;
4121 }
4122
1c4b4e38
CB
4123 ret = sysconf(_SC_CLK_TCK);
4124 if (ret < 0 && errno == EINVAL) {
0ecddf02
CB
4125 lxcfs_debug(
4126 "%s\n",
4127 "failed to determine number of clock ticks in a second");
4128 return 0;
4129 }
4130
1c4b4e38
CB
4131 ticks_per_sec = (uint64_t)ret;
4132 res = (double)clockticks / ticks_per_sec;
4133 return res;
0ecddf02
CB
4134}
4135
1c4b4e38 4136static double get_reaper_age(pid_t pid)
0ecddf02 4137{
1c4b4e38
CB
4138 uint64_t uptime_ms;
4139 double procstart, procage;
0ecddf02
CB
4140
4141 /* We need to substract the time the process has started since system
4142 * boot minus the time when the system has started to get the actual
4143 * reaper age.
4144 */
4145 procstart = get_reaper_start_time_in_sec(pid);
4146 procage = procstart;
4147 if (procstart > 0) {
4148 int ret;
4149 struct timespec spec;
4150
4151 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4152 if (ret < 0)
4153 return 0;
1c4b4e38 4154
0ecddf02
CB
4155 /* We could make this more precise here by using the tv_nsec
4156 * field in the timespec struct and convert it to milliseconds
4157 * and then create a double for the seconds and milliseconds but
4158 * that seems more work than it is worth.
4159 */
1c4b4e38
CB
4160 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4161 procage = (uptime_ms - (procstart * 1000)) / 1000;
0ecddf02
CB
4162 }
4163
4164 return procage;
4165}
4166
/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
{
	int cpucount = get_nprocs_conf();
	struct cpuacct_usage *cpu_usage;
	int rv = 0, i, j, ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;
	char *usage_str = NULL;

	ticks_per_sec = sysconf(_SC_CLK_TCK);

	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_v(
			"%s\n",
			"read_cpuacct_usage_all failed to determine number of clock ticks "
			"in a second");
		return -1;
	}

	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		// read cpuacct.usage_percpu instead
		lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
		if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
			rv = -1;
			goto err;
		}
		lxcfs_v("usage_str: %s\n", usage_str);

		// convert cpuacct.usage_percpu into cpuacct.usage_all
		lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");

		char *data = NULL;
		size_t sz = 0, asz = 0;

		/* Synthesize the usage_all header, then one "cpu user 0"
		 * line per value (usage_percpu has no system column). */
		must_strcat(&data, &sz, &asz, "cpu user system\n");

		int i = 0, read_pos = 0, read_cnt=0;
		while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
			lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
			must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
			i++;
			read_pos += read_cnt;
		}

		free(usage_str);
		usage_str = data;

		lxcfs_v("usage_str: %s\n", usage_str);
	}

	/* From here on, usage_str is in cpuacct.usage_all format; %n
	 * tracks how far each sscanf advanced through the buffer. */
	int read_pos = 0, read_cnt=0;
	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
		lxcfs_error("read_cpuacct_usage_all reading first line from "
				"%s/cpuacct.usage_all failed.\n", cg);
		rv = -1;
		goto err;
	}

	read_pos += read_cnt;

	for (i = 0, j = 0; i < cpucount; i++) {
		ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
				&cg_system, &read_cnt);

		if (ret == EOF)
			break;

		if (ret != 3) {
			lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
					"failed.\n", cg);
			rv = -1;
			goto err;
		}

		read_pos += read_cnt;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	rv = 0;
	*return_usage = cpu_usage;
	*size = cpucount;

err:
	if (usage_str)
		free(usage_str);

	if (rv != 0) {
		free(cpu_usage);
		*return_usage = NULL;
	}

	return rv;
}
4275
056adcef
JS
4276static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4277{
4278 int i;
4279 unsigned long sum = 0;
4280
4281 for (i = 0; i < cpu_count; i++) {
77005a6c
JS
4282 if (!newer[i].online)
4283 continue;
4284
056adcef
JS
4285 /* When cpuset is changed on the fly, the CPUs might get reordered.
4286 * We could either reset all counters, or check that the substractions
4287 * below will return expected results.
4288 */
4289 if (newer[i].user > older[i].user)
4290 diff[i].user = newer[i].user - older[i].user;
4291 else
4292 diff[i].user = 0;
4293
4294 if (newer[i].system > older[i].system)
4295 diff[i].system = newer[i].system - older[i].system;
4296 else
4297 diff[i].system = 0;
4298
4299 if (newer[i].idle > older[i].idle)
4300 diff[i].idle = newer[i].idle - older[i].idle;
4301 else
4302 diff[i].idle = 0;
4303
4304 sum += diff[i].user;
4305 sum += diff[i].system;
4306 sum += diff[i].idle;
4307 }
4308
4309 return sum;
4310}
4311
4312static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4313{
4314 unsigned long free_space, to_add;
4315
4316 free_space = threshold - usage->user - usage->system;
4317
4318 if (free_space > usage->idle)
4319 free_space = usage->idle;
4320
4321 to_add = free_space > *surplus ? *surplus : free_space;
4322
4323 *counter += to_add;
4324 usage->idle -= to_add;
4325 *surplus -= to_add;
4326}
4327
/*
 * Walk a hash-bucket list and remove nodes whose cgroup no longer has a
 * cpu.shares file (i.e. the cgroup is gone). Returns the new list head.
 * Callers must hold the bucket's write lock.
 */
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL, *prev, *tmp;

	for (prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			/* Stale node: unlink it, then free it. */
			tmp = node;
			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			/* First surviving node becomes the new head. */
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}
4354
/* Minimum number of seconds between prune scans of a bucket. */
#define PROC_STAT_PRUNE_INTERVAL 10
/*
 * Prune stale entries from the cpuview hash table. Rate-limited: as
 * soon as one bucket was checked within the last
 * PROC_STAT_PRUNE_INTERVAL seconds, the remaining buckets are skipped
 * too (the whole scan returns early).
 */
static void prune_proc_stat_history(void)
{
	int i;
	time_t now = time(NULL);

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}
4377
/*
 * Look up the stat node for cgroup `cg` in bucket `head`, taking the
 * bucket lock for reading. Also triggers opportunistic pruning of
 * stale nodes after the lookup. Returns NULL when not found.
 */
static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	/* Prune only after the read lock is dropped (prune takes the
	 * write lock). */
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}
4403
4404static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4405{
4406 struct cg_proc_stat *node;
4407 int i;
4408
4409 node = malloc(sizeof(struct cg_proc_stat));
4410 if (!node)
4411 goto err;
4412
4413 node->cg = NULL;
4414 node->usage = NULL;
4415 node->view = NULL;
4416
4417 node->cg = malloc(strlen(cg) + 1);
4418 if (!node->cg)
4419 goto err;
4420
4421 strcpy(node->cg, cg);
4422
4423 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4424 if (!node->usage)
4425 goto err;
4426
4427 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4428
4429 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4430 if (!node->view)
4431 goto err;
4432
4433 node->cpu_count = cpu_count;
4434 node->next = NULL;
4435
2f49b662
JS
4436 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4437 lxcfs_error("%s\n", "Failed to initialize node lock");
4438 goto err;
4439 }
4440
056adcef
JS
4441 for (i = 0; i < cpu_count; i++) {
4442 node->view[i].user = 0;
4443 node->view[i].system = 0;
4444 node->view[i].idle = 0;
4445 }
4446
4447 return node;
4448
4449err:
4450 if (node && node->cg)
4451 free(node->cg);
4452 if (node && node->usage)
4453 free(node->usage);
4454 if (node && node->view)
4455 free(node->view);
4456 if (node)
4457 free(node);
4458
4459 return NULL;
4460}
4461
/*
 * Insert `new_node` into its hash bucket under the bucket write lock.
 * If a node for the same cgroup was inserted concurrently, `new_node`
 * is freed and the existing node returned — callers must always use
 * the return value, never `new_node` itself.
 */
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		/* Reached the tail: append. */
		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}
4498
895f28e5
JS
4499static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4500{
4501 struct cpuacct_usage *new_usage, *new_view;
4502 int i;
4503
4504 /* Allocate new memory */
4505 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4506 if (!new_usage)
4507 return false;
4508
4509 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4510 if (!new_view) {
4511 free(new_usage);
4512 return false;
4513 }
4514
4515 /* Copy existing data & initialize new elements */
4516 for (i = 0; i < cpu_count; i++) {
4517 if (i < node->cpu_count) {
4518 new_usage[i].user = node->usage[i].user;
4519 new_usage[i].system = node->usage[i].system;
4520 new_usage[i].idle = node->usage[i].idle;
4521
4522 new_view[i].user = node->view[i].user;
4523 new_view[i].system = node->view[i].system;
4524 new_view[i].idle = node->view[i].idle;
4525 } else {
4526 new_usage[i].user = 0;
4527 new_usage[i].system = 0;
4528 new_usage[i].idle = 0;
4529
4530 new_view[i].user = 0;
4531 new_view[i].system = 0;
4532 new_view[i].idle = 0;
4533 }
4534 }
4535
4536 free(node->usage);
4537 free(node->view);
4538
4539 node->usage = new_usage;
4540 node->view = new_view;
4541 node->cpu_count = cpu_count;
4542
4543 return true;
4544}
4545
/*
 * Find the stat node for `cg`, creating and inserting one when missing.
 * On success the node is returned with node->lock HELD — the caller is
 * responsible for unlocking it. Returns NULL on allocation failure or
 * when growing the node's arrays after CPU hotplug fails.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node may return an existing node inserted
		 * by a concurrent caller; always use its return value. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
					node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4581
4582static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4583{
4584 int i;
4585
4586 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4587 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4588
4589 for (i = 0; i < cpu_count; i++) {
4590 node->view[i].user = 0;
4591 node->view[i].system = 0;
4592 node->view[i].idle = 0;
4593 }
4594
4595 node->cpu_count = cpu_count;
4596}
4597
79612c8b 4598static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
056adcef
JS
4599{
4600 char *line = NULL;
4601 size_t linelen = 0, total_len = 0, rv = 0, l;
4602 int curcpu = -1; /* cpu numbering starts at 0 */
77005a6c 4603 int physcpu, i;
056adcef
JS
4604 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4605 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4606 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4607 unsigned long user_surplus = 0, system_surplus = 0;
4608 unsigned long total_sum, threshold;
4609 struct cg_proc_stat *stat_node;
4610 struct cpuacct_usage *diff = NULL;
77005a6c 4611 int nprocs = get_nprocs_conf();
056adcef 4612
79612c8b
JS
4613 if (cg_cpu_usage_size < nprocs)
4614 nprocs = cg_cpu_usage_size;
4615
056adcef
JS
4616 /* Read all CPU stats and stop when we've encountered other lines */
4617 while (getline(&line, &linelen, f) != -1) {
77005a6c 4618 int ret;
056adcef
JS
4619 char cpu_char[10]; /* That's a lot of cores */
4620 uint64_t all_used, cg_used;
4621
4622 if (strlen(line) == 0)
4623 continue;
4624 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4625 /* not a ^cpuN line containing a number N */
4626 break;
4627 }
4628
77005a6c 4629 if (sscanf(cpu_char, "%d", &physcpu) != 1)
056adcef 4630 continue;
77005a6c 4631
79612c8b
JS
4632 if (physcpu >= cg_cpu_usage_size)
4633 continue;
4634
056adcef
JS
4635 curcpu ++;
4636 cpu_cnt ++;
4637
77005a6c
JS
4638 if (!cpu_in_cpuset(physcpu, cpuset)) {
4639 for (i = curcpu; i <= physcpu; i++) {
4640 cg_cpu_usage[i].online = false;
4641 }
4642 continue;
4643 }
4644
4645 if (curcpu < physcpu) {
4646 /* Some CPUs may be disabled */
4647 for (i = curcpu; i < physcpu; i++)
4648 cg_cpu_usage[i].online = false;
4649
4650 curcpu = physcpu;
4651 }
4652
4653 cg_cpu_usage[curcpu].online = true;
4654
056adcef
JS
4655 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4656 &user,
4657 &nice,
4658 &system,
4659 &idle,
4660 &iowait,
4661 &irq,
4662 &softirq,
4663 &steal,
4664 &guest,
4665 &guest_nice);
4666
4667 if (ret != 10)
4668 continue;
4669
4670 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4671 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4672
4673 if (all_used >= cg_used) {
4674 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4675
4676 } else {
4677 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4678 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4679 curcpu, cg, all_used, cg_used);
4680 cg_cpu_usage[curcpu].idle = idle;
4681 }
4682 }
4683
4684 /* Cannot use more CPUs than is available due to cpuset */
4685 if (max_cpus > cpu_cnt)
4686 max_cpus = cpu_cnt;
4687
2f49b662 4688 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
056adcef
JS
4689
4690 if (!stat_node) {
2f49b662
JS
4691 lxcfs_error("unable to find/create stat node for %s\n", cg);
4692 rv = 0;
4693 goto err;
056adcef
JS
4694 }
4695
4696 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4697 if (!diff) {
4698 rv = 0;
4699 goto err;
4700 }
4701
4702 /*
4703 * If the new values are LOWER than values stored in memory, it means
4704 * the cgroup has been reset/recreated and we should reset too.
4705 */
77005a6c
JS
4706 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4707 if (!cg_cpu_usage[curcpu].online)
4708 continue;
4709
4710 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4711 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4712
4713 break;
4714 }
056adcef 4715
77005a6c
JS
4716 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4717
4718 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4719 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4720
4721 if (!stat_node->usage[curcpu].online)
4722 continue;
4723
4724 i++;
056adcef 4725
056adcef
JS
4726 stat_node->usage[curcpu].user += diff[curcpu].user;
4727 stat_node->usage[curcpu].system += diff[curcpu].system;
4728 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4729
77005a6c 4730 if (max_cpus > 0 && i >= max_cpus) {
056adcef
JS
4731 user_surplus += diff[curcpu].user;
4732 system_surplus += diff[curcpu].system;
4733 }
4734 }
4735
4736 /* Calculate usage counters of visible CPUs */
4737 if (max_cpus > 0) {
4738 /* threshold = maximum usage per cpu, including idle */
4739 threshold = total_sum / cpu_cnt * max_cpus;
4740
77005a6c 4741 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
77005a6c
JS
4742 if (!stat_node->usage[curcpu].online)
4743 continue;
4744
4745 i++;
4746
db1b32f6
SX
4747 if (i == max_cpus)
4748 break;
4749
056adcef
JS
4750 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4751 continue;
4752
4753 /* Add user */
4754 add_cpu_usage(
4755 &user_surplus,
4756 &diff[curcpu],
4757 &diff[curcpu].user,
4758 threshold);
4759
4760 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4761 continue;
4762
4763 /* If there is still room, add system */
4764 add_cpu_usage(
4765 &system_surplus,
4766 &diff[curcpu],
4767 &diff[curcpu].system,
4768 threshold);
4769 }
4770
4771 if (user_surplus > 0)
4772 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4773 if (system_surplus > 0)
4774 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4775
db1b32f6
SX
4776 unsigned long diff_user = 0;
4777 unsigned long diff_system = 0;
4778 unsigned long diff_idle = 0;
4779 unsigned long max_diff_idle = 0;
4780 unsigned long max_diff_idle_index = 0;
77005a6c 4781 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
77005a6c
JS
4782 if (!stat_node->usage[curcpu].online)
4783 continue;
4784
4785 i++;
4786
db1b32f6
SX
4787 if (i == max_cpus)
4788 break;
4789
056adcef
JS
4790 stat_node->view[curcpu].user += diff[curcpu].user;
4791 stat_node->view[curcpu].system += diff[curcpu].system;
4792 stat_node->view[curcpu].idle += diff[curcpu].idle;
4793
4794 user_sum += stat_node->view[curcpu].user;
4795 system_sum += stat_node->view[curcpu].system;
4796 idle_sum += stat_node->view[curcpu].idle;
056adcef 4797
db1b32f6
SX
4798 diff_user += diff[curcpu].user;
4799 diff_system += diff[curcpu].system;
4800 diff_idle += diff[curcpu].idle;
4801 if (diff[curcpu].idle > max_diff_idle) {
4802 max_diff_idle = diff[curcpu].idle;
4803 max_diff_idle_index = curcpu;
4804 }
4805
4806 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4807 }
4808 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4809
4810 // revise cpu usage view to support partial cpu case
4811 double exact_cpus = exact_cpu_count(cg);
4812 if (exact_cpus < (double)max_cpus){
4813 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4814 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4815 lxcfs_v("delta: %lu\n", delta);
4816 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4817 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4818 lxcfs_v("idle_sum after: %lu\n", idle_sum);
1c4b4e38 4819
db1b32f6
SX
4820 curcpu = max_diff_idle_index;
4821 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4822 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4823 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4824 }
056adcef 4825 } else {
77005a6c
JS
4826 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4827 if (!stat_node->usage[curcpu].online)
4828 continue;
4829
056adcef
JS
4830 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4831 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4832 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4833
4834 user_sum += stat_node->view[curcpu].user;
4835 system_sum += stat_node->view[curcpu].system;
4836 idle_sum += stat_node->view[curcpu].idle;
4837 }
4838 }
4839
4840 /* Render the file */
4841 /* cpu-all */
4842 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4843 user_sum,
4844 system_sum,
4845 idle_sum);
db1b32f6 4846 lxcfs_v("cpu-all: %s\n", buf);
056adcef
JS
4847
4848 if (l < 0) {
4849 perror("Error writing to cache");
4850 rv = 0;
4851 goto err;
056adcef
JS
4852 }
4853 if (l >= buf_size) {
4854 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4855 rv = 0;
4856 goto err;
4857 }
4858
4859 buf += l;
4860 buf_size -= l;
4861 total_len += l;
4862
4863 /* Render visible CPUs */
77005a6c
JS
4864 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4865 if (!stat_node->usage[curcpu].online)
4866 continue;
4867
4868 i++;
4869
4870 if (max_cpus > 0 && i == max_cpus)
056adcef
JS
4871 break;
4872
4873 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
77005a6c 4874 i,
056adcef
JS
4875 stat_node->view[curcpu].user,
4876 stat_node->view[curcpu].system,
4877 stat_node->view[curcpu].idle);
db1b32f6 4878 lxcfs_v("cpu: %s\n", buf);
056adcef
JS
4879
4880 if (l < 0) {
4881 perror("Error writing to cache");
4882 rv = 0;
4883 goto err;
4884
4885 }
4886 if (l >= buf_size) {
4887 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4888 rv = 0;
4889 goto err;
4890 }
4891
4892 buf += l;
4893 buf_size -= l;
4894 total_len += l;
4895 }
4896
4897 /* Pass the rest of /proc/stat, start with the last line read */
4898 l = snprintf(buf, buf_size, "%s", line);
4899
4900 if (l < 0) {
4901 perror("Error writing to cache");
4902 rv = 0;
4903 goto err;
4904
4905 }
4906 if (l >= buf_size) {
4907 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4908 rv = 0;
4909 goto err;
4910 }
4911
4912 buf += l;
4913 buf_size -= l;
4914 total_len += l;
4915
4916 /* Pass the rest of the host's /proc/stat */
4917 while (getline(&line, &linelen, f) != -1) {
4918 l = snprintf(buf, buf_size, "%s", line);
4919 if (l < 0) {
4920 perror("Error writing to cache");
4921 rv = 0;
4922 goto err;
4923 }
4924 if (l >= buf_size) {
4925 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4926 rv = 0;
4927 goto err;
4928 }
4929 buf += l;
4930 buf_size -= l;
4931 total_len += l;
4932 }
4933
4934 rv = total_len;
4935
4936err:
2f49b662
JS
4937 if (stat_node)
4938 pthread_mutex_unlock(&stat_node->lock);
056adcef
JS
4939 if (line)
4940 free(line);
4941 if (diff)
4942 free(diff);
4943 return rv;
4944}
4945
f34de69a 4946#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
/* FUSE read handler for the emulated /proc/stat.
 *
 * Serves a cached result for continuation reads (offset != 0). Otherwise
 * determines the caller's container via its init pid and cpuset cgroup,
 * then rewrites the host's /proc/stat so only CPUs in the cgroup's cpuset
 * are shown (renumbered from 0). When cpuacct data is available, per-CPU
 * user/system come from the cgroup and idle is derived from the host's
 * counters; otherwise host lines are passed through. Delegates to
 * cpuview_proc_stat() when the cpuview mode is enabled for the cgroup.
 * Returns bytes copied into @buf, or 0 on error.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	if (offset){
		/* Continuation read: serve the remainder of the cached buffer. */
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;

	/*
	 * when container run with host pid namespace initpid == 1, cgroup will "/"
	 * we should return host os's /proc contents.
	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
	 */
	if (initpid == 1) {
	    return read_file("/proc/stat", buf, size, d);
	}

	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide CPUs outside the cgroup's cpuset; visible CPUs are
		 * renumbered densely starting from 0. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10 || !cg_cpu_usage) {
			/* Pass the host line through, relabeled with the
			 * container-visible CPU number. */
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			/* Idle shown to the container = host idle plus busy
			 * time on this CPU not spent by the cgroup. */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Write the aggregate "cpu " line into the reserved space at the
	 * front of the buffer, then shift the per-CPU lines up behind it. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
5200
0ecddf02
CB
/* Return the CPU time, in seconds, consumed by the task group of @task's
 * reaper, read from its cpuacct.usage counter (nanoseconds). This is only
 * meaningful when the container has its own cpuacct cgroup; otherwise the
 * busy time of unrelated tasks sharing that cgroup is counted as well.
 * Returns 0 when the reaper or its cgroup cannot be determined.
 */
static double get_reaper_busy(pid_t task)
{
	char *cg = NULL, *val = NULL;
	double seconds = 0;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cg = get_pid_cgroup(initpid, "cpuacct");
	if (!cg)
		goto out;
	prune_init_slice(cg);

	if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &val))
		seconds = (double)strtoul(val, NULL, 10) / 1000000000;

out:
	free(cg);
	free(val);
	return seconds;
}
5232
#if RELOADTEST
/* Test hook, compiled in only when RELOADTEST is set: drop a marker file
 * so the reload test can detect that this code actually ran after a
 * library (re)load. Errors are deliberately ignored. */
void iwashere(void)
{
	int fd;

	fd = creat("/tmp/lxcfs-iwashere", 0644);
	if (fd >= 0)
		close(fd);
}
#endif
5243
/*
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the mtime for the reaper for
 * the calling pid as returned by getreaperage
 */
static int proc_uptime_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	/* Busy seconds of the caller's container, per its cpuacct cgroup. */
	double busytime = get_reaper_busy(fc->pid);
	char *cache = d->buf;
	ssize_t total_len = 0;
	double idletime, reaperage;

#if RELOADTEST
	iwashere();
#endif

	if (offset){
		/* Continuation read: serve from the cached buffer. */
		if (!d->cached)
			return 0;
		if (offset > d->size)
			return -EINVAL;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	reaperage = get_reaper_age(fc->pid);
	/* To understand why this is done, please read the comment to the
	 * get_reaper_busy() function.
	 */
	idletime = reaperage;
	if (reaperage >= busytime)
		idletime = reaperage - busytime;

	/* Emit "<uptime> <idle>" with two decimals, as /proc/uptime does. */
	total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
	if (total_len < 0 || total_len >= d->buflen){
		lxcfs_error("%s\n", "failed to write to cache");
		return 0;
	}

	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size) total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
5296
/* FUSE read handler for the emulated /proc/diskstats.
 *
 * For each device line in the host's /proc/diskstats, substitutes I/O
 * counters derived from the caller's blkio cgroup (recursive io_serviced,
 * io_merged, io_service_bytes, io_wait_time, io_service_time). Bytes are
 * converted to 512-byte sectors and nanosecond times to milliseconds.
 * Devices with all-zero cgroup counters are omitted. Falls back to the
 * host file when no blkio cgroup is found. Returns bytes copied, or 0 on
 * error.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
			*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	/* ios_pgr and rq_ticks have no cgroup source; reported as 0. */
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	if (offset){
		/* Continuation read: serve the remainder of the cached result. */
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* blkio reports bytes; diskstats wants 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* blkio times are nanoseconds; diskstats wants milliseconds. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* Skip devices the cgroup never touched. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5429
70dcc12e
SH
5430static int proc_swaps_read(char *buf, size_t size, off_t offset,
5431 struct fuse_file_info *fi)
5432{
5433 struct fuse_context *fc = fuse_get_context();
5434 struct file_info *d = (struct file_info *)fi->fh;
5435 char *cg = NULL;
018246ff 5436 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 5437 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
5438 ssize_t total_len = 0, rv = 0;
5439 ssize_t l = 0;
70dcc12e
SH
5440 char *cache = d->buf;
5441
5442 if (offset) {
5443 if (offset > d->size)
5444 return -EINVAL;
5445 if (!d->cached)
5446 return 0;
5447 int left = d->size - offset;
5448 total_len = left > size ? size: left;
5449 memcpy(buf, cache + offset, total_len);
5450 return total_len;
5451 }
5452
5453 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 5454 if (initpid <= 1 || is_shared_pidns(initpid))
70dcc12e
SH
5455 initpid = fc->pid;
5456 cg = get_pid_cgroup(initpid, "memory");
5457 if (!cg)
5458 return read_file("/proc/swaps", buf, size, d);
6d2f6996 5459 prune_init_slice(cg);
70dcc12e 5460
018246ff 5461 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
5462
5463 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5464 goto err;
5465
70dcc12e
SH
5466 memusage = strtoul(memusage_str, NULL, 10);
5467
5468 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5469 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5470
018246ff 5471 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
5472 memswusage = strtoul(memswusage_str, NULL, 10);
5473
70dcc12e
SH
5474 swap_total = (memswlimit - memlimit) / 1024;
5475 swap_free = (memswusage - memusage) / 1024;
5476 }
5477
5478 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5479
5480 /* When no mem + swap limit is specified or swapaccount=0*/
5481 if (!memswlimit) {
5482 char *line = NULL;
5483 size_t linelen = 0;
5484 FILE *f = fopen("/proc/meminfo", "r");
5485
5486 if (!f)
5487 goto err;
5488
5489 while (getline(&line, &linelen, f) != -1) {
5490 if (startswith(line, "SwapTotal:")) {
5491 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5492 } else if (startswith(line, "SwapFree:")) {
5493 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5494 }
5495 }
5496
5497 free(line);
5498 fclose(f);
5499 }
5500
5501 if (swap_total > 0) {
a262ddb7
CB
5502 l = snprintf(d->buf + total_len, d->size - total_len,
5503 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5504 swap_total, swap_free);
5505 total_len += l;
70dcc12e
SH
5506 }
5507
a262ddb7 5508 if (total_len < 0 || l < 0) {
70dcc12e
SH
5509 perror("Error writing to cache");
5510 rv = 0;
5511 goto err;
5512 }
5513
5514 d->cached = 1;
5515 d->size = (int)total_len;
5516
5517 if (total_len > size) total_len = size;
5518 memcpy(buf, d->buf, total_len);
5519 rv = total_len;
5520
5521err:
5522 free(cg);
5523 free(memswlimit_str);
5524 free(memlimit_str);
5525 free(memusage_str);
5526 free(memswusage_str);
70dcc12e
SH
5527 return rv;
5528}
/*
 * Find the process pid from cgroup path.
 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
 * @pid_buf : put pid to pid_buf.
 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : the depth of cgroup in container.
 * @sum : return the number of pid.
 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Recurses into child cgroup directories up to @depth levels, then appends
 * every line of this cgroup's cgroup.procs (pid strings, newline included)
 * to *pid_buf. Returns the updated element count; on any open failure the
 * count collected so far is returned.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0 */
	/* NOTE(review): allocations below retry forever on failure rather
	 * than erroring out — a deliberate busy-wait on OOM, it seems. */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() releases it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): comparing only 1 char skips every dotfile,
		 * which makes the ".." test below unreachable — presumably
		 * intentional (hidden cgroup dirs are ignored); confirm. */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			/* Recurse into the child cgroup with one less level. */
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopen() takes ownership of fd; fclose() releases it. */
	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Append each pid line (newline kept) to the caller's array. */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5615/*
5616 * calc_load calculates the load according to the following formula:
5617 * load1 = load0 * exp + active * (1 - exp)
5618 *
5619 * @load1: the new loadavg.
5620 * @load0: the former loadavg.
5621 * @active: the total number of running pid at this moment.
5622 * @exp: the fixed-point defined in the beginning.
5623 */
5624static unsigned long
5625calc_load(unsigned long load, unsigned long exp, unsigned long active)
5626{
5627 unsigned long newload;
5628
5629 active = active > 0 ? active * FIXED_1 : 0;
5630 newload = load * exp + active * (FIXED_1 - exp);
5631 if (active >= load)
5632 newload += FIXED_1 - 1;
5633
5634 return newload / FIXED_1;
5635}
5636
5637/*
5638 * Return 0 means that container p->cg is closed.
5639 * Return -1 means that error occurred in refresh.
5640 * Positive num equals the total number of pid.
5641 */
5642static int refresh_load(struct load_node *p, char *path)
5643{
5644 FILE *f = NULL;
5645 char **idbuf;
5646 char proc_path[256];
5647 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5648 char *line = NULL;
5649 size_t linelen = 0;
5650 int sum, length;
5651 DIR *dp;
5652 struct dirent *file;
5653
5654 do {
5655 idbuf = malloc(sizeof(char *));
5656 } while (!idbuf);
5657 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5658 /* normal exit */
5659 if (sum == 0)
5660 goto out;
5661
5662 for (i = 0; i < sum; i++) {
5663 /*clean up '\n' */
5664 length = strlen(idbuf[i])-1;
5665 idbuf[i][length] = '\0';
5666 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5667 if (ret < 0 || ret > 255) {
5668 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5669 i = sum;
5670 sum = -1;
5671 goto err_out;
5672 }
5673
5674 dp = opendir(proc_path);
5675 if (!dp) {
5676 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5677 continue;
5678 }
5679 while ((file = readdir(dp)) != NULL) {
5680 if (strncmp(file->d_name, ".", 1) == 0)
5681 continue;
5682 if (strncmp(file->d_name, "..", 1) == 0)
5683 continue;
5684 total_pid++;
5685 /* We make the biggest pid become last_pid.*/
5686 ret = atof(file->d_name);
5687 last_pid = (ret > last_pid) ? ret : last_pid;
5688
5689 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5690 if (ret < 0 || ret > 255) {
5691 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5692 i = sum;
5693 sum = -1;
5694 closedir(dp);
5695 goto err_out;
5696 }
5697 f = fopen(proc_path, "r");
5698 if (f != NULL) {
5699 while (getline(&line, &linelen, f) != -1) {
5700 /* Find State */
5701 if ((line[0] == 'S') && (line[1] == 't'))
5702 break;
5703 }
5704 if ((line[7] == 'R') || (line[7] == 'D'))
5705 run_pid++;
5706 fclose(f);
5707 }
5708 }
5709 closedir(dp);
5710 }
5711 /*Calculate the loadavg.*/
5712 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5713 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5714 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5715 p->run_pid = run_pid;
5716 p->total_pid = total_pid;
5717 p->last_pid = last_pid;
5718
5719 free(line);
beb5024e 5720err_out:
6db4f7a3 5721 for (; i > 0; i--)
5722 free(idbuf[i-1]);
5723out:
5724 free(idbuf);
5725 return sum;
5726}
/*
 * Worker thread entry point: every FLUSH_TIME seconds, traverse the whole
 * load hash table and refresh each node's loadavg data, deleting nodes
 * whose cgroup has disappeared. Exits when loadavg_stop is set.
 */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		/* stop_load_daemon() asked us to exit */
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* prefix non-absolute display with '.' so the path is
				 * relative to the cgroup mount fd */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}
				/* refresh_load() returns 0 when the cgroup is gone */
				sum = refresh_load(f, path);
				if (sum == 0) {
					f = del_node(f, i);
				} else {
out: f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		/* Sleep away the remainder of the flush interval.
		 * NOTE(review): clock() measures CPU time, not wall time, and if a
		 * refresh pass takes longer than FLUSH_TIME the subtraction goes
		 * negative and the (implicitly unsigned) usleep argument wraps —
		 * confirm before relying on precise refresh timing. */
		time2 = clock();
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5787
/*
 * Read handler for the virtualized /proc/loadavg.
 * Looks up (or creates) the caller's cgroup node in the load hash table and
 * renders its per-container load averages in the usual loadavg format.
 * Falls back to the host's /proc/loadavg when virtualization is off or the
 * cgroup cannot be determined.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continued read: serve the remainder straight from the cache. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	/* loadavg virtualization disabled: pass the host file through. */
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* locate_node() returns with load_hash[hash].rdlock HELD; every exit
	 * path below must release it exactly once. */
	n = locate_node(cg, hash);

	/* First time this cgroup is seen: create and insert a fresh node. */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 is the conventional rounding term for loadavg display. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	/* NOTE(review): total_len is size_t, so "total_len < 0" can never be
	 * true; a negative snprintf return wraps to a huge value and is only
	 * caught by the ">= d->buflen" half — consider an int intermediate. */
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5880/* Return a positive number on success, return 0 on failure.*/
5881pthread_t load_daemon(int load_use)
5882{
5883 int ret;
5884 pthread_t pid;
5885
5886 ret = init_load();
5887 if (ret == -1) {
5888 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5889 return 0;
5890 }
5891 ret = pthread_create(&pid, NULL, load_begin, NULL);
5892 if (ret != 0) {
5893 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5894 load_free();
5895 return 0;
5896 }
5897 /* use loadavg, here loadavg = 1*/
5898 loadavg = load_use;
5899 return pid;
5900}
70dcc12e 5901
a83618e2
JS
5902/* Returns 0 on success. */
5903int stop_load_daemon(pthread_t pid)
5904{
5905 int s;
5906
5907 /* Signal the thread to gracefully stop */
5908 loadavg_stop = 1;
5909
5910 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5911 if (s != 0) {
5912 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5913 return -1;
5914 }
5915
5916 load_free();
5917 loadavg_stop = 0;
5918
5919 return 0;
5920}
5921
237e200e
SH
/* Sum the byte length of every line in @which; returns 0 when the file
 * cannot be opened. Used to size the read cache for emulated proc files. */
static off_t get_procfile_size(const char *which)
{
	FILE *fp;
	char *buf = NULL;
	size_t cap = 0;
	ssize_t nread;
	off_t total = 0;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	while ((nread = getline(&buf, &cap, fp)) != -1)
		total += nread;

	fclose(fp);
	free(buf);

	return total;
}
5938
5939int proc_getattr(const char *path, struct stat *sb)
5940{
5941 struct timespec now;
5942
5943 memset(sb, 0, sizeof(struct stat));
5944 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5945 return -EINVAL;
5946 sb->st_uid = sb->st_gid = 0;
5947 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5948 if (strcmp(path, "/proc") == 0) {
5949 sb->st_mode = S_IFDIR | 00555;
5950 sb->st_nlink = 2;
5951 return 0;
5952 }
5953 if (strcmp(path, "/proc/meminfo") == 0 ||
5954 strcmp(path, "/proc/cpuinfo") == 0 ||
5955 strcmp(path, "/proc/uptime") == 0 ||
5956 strcmp(path, "/proc/stat") == 0 ||
70dcc12e 5957 strcmp(path, "/proc/diskstats") == 0 ||
46be8eed 5958 strcmp(path, "/proc/swaps") == 0 ||
5959 strcmp(path, "/proc/loadavg") == 0) {
237e200e
SH
5960 sb->st_size = 0;
5961 sb->st_mode = S_IFREG | 00444;
5962 sb->st_nlink = 1;
5963 return 0;
5964 }
5965
5966 return -ENOENT;
5967}
5968
5969int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5970 struct fuse_file_info *fi)
5971{
d639f863
CB
5972 if (filler(buf, ".", NULL, 0) != 0 ||
5973 filler(buf, "..", NULL, 0) != 0 ||
5974 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5975 filler(buf, "meminfo", NULL, 0) != 0 ||
5976 filler(buf, "stat", NULL, 0) != 0 ||
5977 filler(buf, "uptime", NULL, 0) != 0 ||
5978 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5979 filler(buf, "swaps", NULL, 0) != 0 ||
5980 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5981 return -EINVAL;
5982 return 0;
5983}
5984
5985int proc_open(const char *path, struct fuse_file_info *fi)
5986{
5987 int type = -1;
5988 struct file_info *info;
5989
5990 if (strcmp(path, "/proc/meminfo") == 0)
5991 type = LXC_TYPE_PROC_MEMINFO;
5992 else if (strcmp(path, "/proc/cpuinfo") == 0)
5993 type = LXC_TYPE_PROC_CPUINFO;
5994 else if (strcmp(path, "/proc/uptime") == 0)
5995 type = LXC_TYPE_PROC_UPTIME;
5996 else if (strcmp(path, "/proc/stat") == 0)
5997 type = LXC_TYPE_PROC_STAT;
5998 else if (strcmp(path, "/proc/diskstats") == 0)
5999 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
6000 else if (strcmp(path, "/proc/swaps") == 0)
6001 type = LXC_TYPE_PROC_SWAPS;
46be8eed 6002 else if (strcmp(path, "/proc/loadavg") == 0)
6003 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
6004 if (type == -1)
6005 return -ENOENT;
6006
6007 info = malloc(sizeof(*info));
6008 if (!info)
6009 return -ENOMEM;
6010
6011 memset(info, 0, sizeof(*info));
6012 info->type = type;
6013
6014 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
6015 do {
6016 info->buf = malloc(info->buflen);
6017 } while (!info->buf);
6018 memset(info->buf, 0, info->buflen);
6019 /* set actual size to buffer size */
6020 info->size = info->buflen;
6021
6022 fi->fh = (unsigned long)info;
6023 return 0;
6024}
6025
bddbb106
SH
/* FUSE access handler for /proc: the tree is entirely read-only, so any
 * request asking for more than read permission is refused. */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if (mask & ~R_OK)
		return -EACCES;

	return 0;
}
6036
237e200e
SH
/* FUSE release handler for /proc files: free the per-open file_info that
 * proc_open() attached to fi->fh. Always succeeds. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
6042
6043int proc_read(const char *path, char *buf, size_t size, off_t offset,
6044 struct fuse_file_info *fi)
6045{
6046 struct file_info *f = (struct file_info *) fi->fh;
6047
6048 switch (f->type) {
6049 case LXC_TYPE_PROC_MEMINFO:
6050 return proc_meminfo_read(buf, size, offset, fi);
6051 case LXC_TYPE_PROC_CPUINFO:
6052 return proc_cpuinfo_read(buf, size, offset, fi);
6053 case LXC_TYPE_PROC_UPTIME:
6054 return proc_uptime_read(buf, size, offset, fi);
6055 case LXC_TYPE_PROC_STAT:
6056 return proc_stat_read(buf, size, offset, fi);
6057 case LXC_TYPE_PROC_DISKSTATS:
6058 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
6059 case LXC_TYPE_PROC_SWAPS:
6060 return proc_swaps_read(buf, size, offset, fi);
46be8eed 6061 case LXC_TYPE_PROC_LOADAVG:
6062 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
6063 default:
6064 return -EINVAL;
6065 }
6066}
6067
29a73c2f
CB
6068/*
6069 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
6070 */
6071
/* Create @dir and any missing ancestors with @mode, mkdir'ing each
 * slash-terminated prefix in turn; EEXIST is not an error. */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *seg = dir;
	const char *scan = dir;
	char *prefix;

	do {
		/* seg: start of the next path component; scan: its end */
		seg = scan + strspn(scan, "/");
		scan = seg + strcspn(seg, "/");

		/* duplicate everything up to (not including) this component */
		prefix = strndup(dir, seg - dir);
		if (prefix == NULL)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);
	} while (scan != seg);

	return true;
}
6095
6096static bool umount_if_mounted(void)
6097{
6098 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 6099 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
6100 return false;
6101 }
6102 return true;
6103}
6104
2283e240
CB
6105/* __typeof__ should be safe to use with all compilers. */
6106typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
6107static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
6108{
6109 return (fs->f_type == (fs_type_magic)magic_val);
6110}
6111
0a4dea41
CB
6112/*
6113 * looking at fs/proc_namespace.c, it appears we can
6114 * actually expect the rootfs entry to very specifically contain
6115 * " - rootfs rootfs "
6116 * IIUC, so long as we've chrooted so that rootfs is not our root,
6117 * the rootfs entry should always be skipped in mountinfo contents.
6118 */
6119static bool is_on_ramfs(void)
6120{
6121 FILE *f;
6122 char *p, *p2;
6123 char *line = NULL;
6124 size_t len = 0;
6125 int i;
6126
6127 f = fopen("/proc/self/mountinfo", "r");
6128 if (!f)
6129 return false;
6130
6131 while (getline(&line, &len, f) != -1) {
6132 for (p = line, i = 0; p && i < 4; i++)
6133 p = strchr(p + 1, ' ');
6134 if (!p)
6135 continue;
6136 p2 = strchr(p + 1, ' ');
6137 if (!p2)
6138 continue;
6139 *p2 = '\0';
6140 if (strcmp(p + 1, "/") == 0) {
6141 // this is '/'. is it the ramfs?
6142 p = strchr(p2 + 1, '-');
6143 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
6144 free(line);
6145 fclose(f);
6146 return true;
6147 }
6148 }
6149 }
6150 free(line);
6151 fclose(f);
6152 return false;
6153}
6154
cc309f33 6155static int pivot_enter()
0a4dea41 6156{
cc309f33
CB
6157 int ret = -1, oldroot = -1, newroot = -1;
6158
6159 oldroot = open("/", O_DIRECTORY | O_RDONLY);
6160 if (oldroot < 0) {
6161 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
6162 return ret;
6163 }
6164
6165 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
6166 if (newroot < 0) {
6167 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
6168 goto err;
6169 }
6170
6171 /* change into new root fs */
6172 if (fchdir(newroot) < 0) {
6173 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6174 goto err;
6175 }
6176
0a4dea41
CB
6177 /* pivot_root into our new root fs */
6178 if (pivot_root(".", ".") < 0) {
6179 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 6180 goto err;
0a4dea41
CB
6181 }
6182
6183 /*
6184 * At this point the old-root is mounted on top of our new-root.
6185 * To unmounted it we must not be chdir'd into it, so escape back
6186 * to the old-root.
6187 */
6188 if (fchdir(oldroot) < 0) {
6189 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 6190 goto err;
0a4dea41
CB
6191 }
6192
6193 if (umount2(".", MNT_DETACH) < 0) {
6194 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 6195 goto err;
0a4dea41
CB
6196 }
6197
6198 if (fchdir(newroot) < 0) {
6199 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 6200 goto err;
0a4dea41
CB
6201 }
6202
cc309f33
CB
6203 ret = 0;
6204
6205err:
6206 if (oldroot > 0)
6207 close(oldroot);
6208 if (newroot > 0)
6209 close(newroot);
6210
6211 return ret;
0a4dea41
CB
6212}
6213
6214static int chroot_enter()
6215{
6216 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6217 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6218 return -1;
6219 }
6220
6221 if (chroot(".") < 0) {
6222 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6223 return -1;
6224 }
6225
6226 if (chdir("/") < 0) {
6227 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6228 return -1;
6229 }
6230
6231 return 0;
6232}
6233
0232cbac 6234static int permute_and_enter(void)
29a73c2f 6235{
0a4dea41
CB
6236 struct statfs sb;
6237
6238 if (statfs("/", &sb) < 0) {
6239 lxcfs_error("%s\n", "Could not stat / mountpoint.");
cc309f33 6240 return -1;
0a4dea41
CB
6241 }
6242
6243 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6244 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6245 * /proc/1/mountinfo. */
6246 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6247 return chroot_enter();
29a73c2f 6248
cc309f33 6249 if (pivot_enter() < 0) {
0a4dea41 6250 lxcfs_error("%s\n", "Could not perform pivot root.");
cc309f33 6251 return -1;
29a73c2f
CB
6252 }
6253
cc309f33 6254 return 0;
29a73c2f
CB
6255}
6256
6257/* Prepare our new clean root. */
0232cbac 6258static int permute_prepare(void)
29a73c2f
CB
6259{
6260 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 6261 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
6262 return -1;
6263 }
6264
6265 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 6266 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
6267 return -1;
6268 }
6269
6270 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 6271 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
6272 return -1;
6273 }
6274
6275 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 6276 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
6277 return -1;
6278 }
6279
6280 return 0;
6281}
6282
0232cbac
CB
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare new root. */
	if (permute_prepare() < 0)
		return false;

	/* Pivot into new root. */
	return permute_and_enter() >= 0;
}
6296
a257a8ee
CB
6297static int preserve_mnt_ns(int pid)
6298{
6299 int ret;
6300 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6301 char path[len];
6302
6303 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6304 if (ret < 0 || (size_t)ret >= len)
6305 return -1;
6306
6307 return open(path, O_RDONLY | O_CLOEXEC);
6308}
6309
0a4dea41 6310static bool cgfs_prepare_mounts(void)
29a73c2f
CB
6311{
6312 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 6313 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
6314 return false;
6315 }
480262c9 6316
29a73c2f 6317 if (!umount_if_mounted()) {
b8defc3d 6318 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
6319 return false;
6320 }
6321
6322 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 6323 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
6324 return false;
6325 }
6326
a257a8ee
CB
6327 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6328 if (cgroup_mount_ns_fd < 0) {
6329 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6330 return false;
6331 }
6332
480262c9 6333 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 6334 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
6335 return false;
6336 }
480262c9 6337
29a73c2f 6338 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 6339 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
6340 return false;
6341 }
480262c9 6342
29a73c2f
CB
6343 return true;
6344}
6345
0a4dea41 6346static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
6347{
6348 char *target;
6349 size_t clen, len;
6350 int i, ret;
6351
6352 for (i = 0; i < num_hierarchies; i++) {
6353 char *controller = hierarchies[i];
51c7ca35 6354
29a73c2f
CB
6355 clen = strlen(controller);
6356 len = strlen(BASEDIR) + clen + 2;
6357 target = malloc(len);
6358 if (!target)
6359 return false;
51c7ca35 6360
29a73c2f
CB
6361 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6362 if (ret < 0 || ret >= len) {
6363 free(target);
6364 return false;
6365 }
6366 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6367 free(target);
6368 return false;
6369 }
51c7ca35
CB
6370 if (!strcmp(controller, "unified"))
6371 ret = mount("none", target, "cgroup2", 0, NULL);
6372 else
6373 ret = mount(controller, target, "cgroup", 0, controller);
6374 if (ret < 0) {
6375 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
6376 free(target);
6377 return false;
6378 }
6379
6380 fd_hierarchies[i] = open(target, O_DIRECTORY);
6381 if (fd_hierarchies[i] < 0) {
6382 free(target);
6383 return false;
6384 }
6385 free(target);
6386 }
6387 return true;
6388}
6389
480262c9 6390static bool cgfs_setup_controllers(void)
29a73c2f 6391{
0a4dea41 6392 if (!cgfs_prepare_mounts())
29a73c2f 6393 return false;
29a73c2f 6394
0a4dea41 6395 if (!cgfs_mount_hierarchies()) {
b8defc3d 6396 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
29a73c2f
CB
6397 return false;
6398 }
6399
0232cbac 6400 if (!permute_root())
29a73c2f
CB
6401 return false;
6402
6403 return true;
6404}
6405
/* Library constructor: parse /proc/self/cgroup to discover the available
 * cgroup hierarchies, then set up lxcfs's private cgroup mounts in a
 * separate mount namespace before returning to the initial one. */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* each line looks like "<idx>:<controllers>:<path>" */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	/* initialize so the destructor only closes fds that were opened */
	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* remember the cwd: pivot_root below will change it */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* hop back into the namespace we started from */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6496
6497static void __attribute__((destructor)) free_subsystems(void)
6498{
6499 int i;
6500
b8defc3d
CB
6501 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6502
29a73c2f 6503 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
6504 if (hierarchies[i])
6505 free(hierarchies[i]);
480262c9 6506 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
6507 close(fd_hierarchies[i]);
6508 }
237e200e 6509 free(hierarchies);
480262c9 6510 free(fd_hierarchies);
056adcef 6511 free_cpuview();
a257a8ee
CB
6512
6513 if (cgroup_mount_ns_fd >= 0)
6514 close(cgroup_mount_ns_fd);
1c4b4e38 6515}