]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
CPU view: handle disabling/enabling of physical CPUs at runtime
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
42/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
43#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
/* Provide pivot_root() when the C library does not export it. */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifdef __NR_pivot_root
	/* Go through the raw syscall; glibc historically had no wrapper. */
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif
59
237e200e
SH
/* Kinds of virtual files/directories lxcfs serves through FUSE. */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};

/* Per-open-file state, stashed in fuse_file_info->fh by the handlers. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* values above */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
8be92dd1
JS
/* One CPU's cpuacct usage sample. */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online;	/* whether this physical CPU is currently online */
};

/* Loadavg hash table parameters. */
#define LOAD_SIZE 100	/* the size of hash_table */
#define FLUSH_TIME 5	/* the flush rate */
#define DEPTH_DIR 3	/* the depth of per cgroup */
/* Fixed-point load average arithmetic (mirrors the kernel's). */
#define FSHIFT 11		/* nr of bits of precision */
#define FIXED_1 (1 << FSHIFT)	/* 1.0 as fixed-point */
#define EXP_1 1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014		/* 1/exp(5sec/5min) */
#define EXP_15 2037		/* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* Set from a signal/stop path to tell the loadavg thread to exit. */
static volatile sig_atomic_t loadavg_stop = 0;
/* ELF hash of @name, masked to a non-negative int for bucket indexing. */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;

	while (*name) {
		unsigned int high;

		hash = (hash << 4) + *name++;
		high = hash & 0xf0000000;
		if (high != 0)
			hash ^= (high >> 24);
		hash &= ~high;
	}

	return (hash & 0x7fffffff);
}
122
123struct load_node {
124 char *cg; /*cg */
125 unsigned long avenrun[3]; /* Load averages */
126 unsigned int run_pid;
127 unsigned int total_pid;
128 unsigned int last_pid;
129 int cfd; /* The file descriptor of the mounted cgroup */
130 struct load_node *next;
131 struct load_node **pre;
132};
133
134struct load_head {
135 /*
136 * The lock is about insert load_node and refresh load_node.To the first
137 * load_node of each hash bucket, insert and refresh in this hash bucket is
138 * mutually exclusive.
139 */
140 pthread_mutex_t lock;
141 /*
142 * The rdlock is about read loadavg and delete load_node.To each hash
143 * bucket, read and delete is mutually exclusive. But at the same time, we
144 * allow paratactic read operation. This rdlock is at list level.
145 */
146 pthread_rwlock_t rdlock;
147 /*
148 * The rilock is about read loadavg and insert load_node.To the first
149 * load_node of each hash bucket, read and insert is mutually exclusive.
150 * But at the same time, we allow paratactic read operation.
151 */
152 pthread_rwlock_t rilock;
153 struct load_node *next;
154};
155
156static struct load_head load_hash[LOAD_SIZE]; /* hash table */
157/*
158 * init_load initialize the hash table.
159 * Return 0 on success, return -1 on failure.
160 */
161static int init_load(void)
162{
163 int i;
164 int ret;
165
166 for (i = 0; i < LOAD_SIZE; i++) {
167 load_hash[i].next = NULL;
168 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
169 if (ret != 0) {
170 lxcfs_error("%s\n", "Failed to initialize lock");
171 goto out3;
172 }
173 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
174 if (ret != 0) {
175 lxcfs_error("%s\n", "Failed to initialize rdlock");
176 goto out2;
177 }
178 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
179 if (ret != 0) {
180 lxcfs_error("%s\n", "Failed to initialize rilock");
181 goto out1;
182 }
183 }
184 return 0;
185out1:
186 pthread_rwlock_destroy(&load_hash[i].rdlock);
187out2:
188 pthread_mutex_destroy(&load_hash[i].lock);
189out3:
190 while (i > 0) {
191 i--;
192 pthread_mutex_destroy(&load_hash[i].lock);
193 pthread_rwlock_destroy(&load_hash[i].rdlock);
194 pthread_rwlock_destroy(&load_hash[i].rilock);
195 }
196 return -1;
197}
198
199static void insert_node(struct load_node **n, int locate)
200{
201 struct load_node *f;
202
203 pthread_mutex_lock(&load_hash[locate].lock);
204 pthread_rwlock_wrlock(&load_hash[locate].rilock);
205 f = load_hash[locate].next;
206 load_hash[locate].next = *n;
207
208 (*n)->pre = &(load_hash[locate].next);
209 if (f)
210 f->pre = &((*n)->next);
211 (*n)->next = f;
212 pthread_mutex_unlock(&load_hash[locate].lock);
213 pthread_rwlock_unlock(&load_hash[locate].rilock);
214}
215/*
216 * locate_node() finds special node. Not return NULL means success.
217 * It should be noted that rdlock isn't unlocked at the end of code
218 * because this function is used to read special node. Delete is not
219 * allowed before read has ended.
220 * unlock rdlock only in proc_loadavg_read().
221 */
222static struct load_node *locate_node(char *cg, int locate)
223{
224 struct load_node *f = NULL;
225 int i = 0;
226
227 pthread_rwlock_rdlock(&load_hash[locate].rilock);
228 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
229 if (load_hash[locate].next == NULL) {
230 pthread_rwlock_unlock(&load_hash[locate].rilock);
231 return f;
232 }
233 f = load_hash[locate].next;
234 pthread_rwlock_unlock(&load_hash[locate].rilock);
235 while (f && ((i = strcmp(f->cg, cg)) != 0))
236 f = f->next;
237 return f;
238}
239/* Delete the load_node n and return the next node of it. */
240static struct load_node *del_node(struct load_node *n, int locate)
241{
242 struct load_node *g;
243
244 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
245 if (n->next == NULL) {
246 *(n->pre) = NULL;
247 } else {
248 *(n->pre) = n->next;
249 n->next->pre = n->pre;
250 }
251 g = n->next;
252 free(n->cg);
253 free(n);
254 pthread_rwlock_unlock(&load_hash[locate].rdlock);
255 return g;
256}
257
a83618e2 258static void load_free(void)
9c480eb7 259{
260 int i;
261 struct load_node *f, *p;
262
263 for (i = 0; i < LOAD_SIZE; i++) {
264 pthread_mutex_lock(&load_hash[i].lock);
265 pthread_rwlock_wrlock(&load_hash[i].rilock);
266 pthread_rwlock_wrlock(&load_hash[i].rdlock);
267 if (load_hash[i].next == NULL) {
268 pthread_mutex_unlock(&load_hash[i].lock);
269 pthread_mutex_destroy(&load_hash[i].lock);
270 pthread_rwlock_unlock(&load_hash[i].rilock);
271 pthread_rwlock_destroy(&load_hash[i].rilock);
272 pthread_rwlock_unlock(&load_hash[i].rdlock);
273 pthread_rwlock_destroy(&load_hash[i].rdlock);
274 continue;
275 }
276 for (f = load_hash[i].next; f; ) {
277 free(f->cg);
278 p = f->next;
279 free(f);
280 f = p;
281 }
282 pthread_mutex_unlock(&load_hash[i].lock);
283 pthread_mutex_destroy(&load_hash[i].lock);
284 pthread_rwlock_unlock(&load_hash[i].rilock);
285 pthread_rwlock_destroy(&load_hash[i].rilock);
286 pthread_rwlock_unlock(&load_hash[i].rdlock);
287 pthread_rwlock_destroy(&load_hash[i].rdlock);
288 }
289}
056adcef
JS
290
291/* Data for CPU view */
292struct cg_proc_stat {
293 char *cg;
294 struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
295 struct cpuacct_usage *view; // Usage stats reported to the container
296 int cpu_count;
2f49b662 297 pthread_mutex_t lock; // For node manipulation
056adcef
JS
298 struct cg_proc_stat *next;
299};
300
301struct cg_proc_stat_head {
302 struct cg_proc_stat *next;
951acc94 303 time_t lastcheck;
2f49b662
JS
304
305 /*
306 * For access to the list. Reading can be parallel, pruning is exclusive.
307 */
308 pthread_rwlock_t lock;
056adcef
JS
309};
310
311#define CPUVIEW_HASH_SIZE 100
312static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
313
314static bool cpuview_init_head(struct cg_proc_stat_head **head)
315{
316 *head = malloc(sizeof(struct cg_proc_stat_head));
317 if (!(*head)) {
318 lxcfs_error("%s\n", strerror(errno));
319 return false;
320 }
321
951acc94 322 (*head)->lastcheck = time(NULL);
056adcef 323 (*head)->next = NULL;
2f49b662
JS
324
325 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
326 lxcfs_error("%s\n", "Failed to initialize list lock");
327 free(*head);
328 return false;
329 }
330
056adcef
JS
331 return true;
332}
333
334static bool init_cpuview()
335{
336 int i;
337
338 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
339 proc_stat_history[i] = NULL;
340
341 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
342 if (!cpuview_init_head(&proc_stat_history[i]))
343 goto err;
344 }
345
346 return true;
347
348err:
349 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
350 if (proc_stat_history[i]) {
351 free(proc_stat_history[i]);
352 proc_stat_history[i] = NULL;
353 }
354 }
355
356 return false;
357}
358
951acc94
JS
359static void free_proc_stat_node(struct cg_proc_stat *node)
360{
2f49b662 361 pthread_mutex_destroy(&node->lock);
951acc94
JS
362 free(node->cg);
363 free(node->usage);
364 free(node->view);
365 free(node);
366}
367
056adcef
JS
368static void cpuview_free_head(struct cg_proc_stat_head *head)
369{
370 struct cg_proc_stat *node, *tmp;
371
372 if (head->next) {
373 node = head->next;
374
375 for (;;) {
376 tmp = node;
377 node = node->next;
951acc94 378 free_proc_stat_node(tmp);
056adcef
JS
379
380 if (!node)
381 break;
382 }
383 }
384
2f49b662 385 pthread_rwlock_destroy(&head->lock);
056adcef
JS
386 free(head);
387}
388
389static void free_cpuview()
390{
391 int i;
392
393 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
394 if (proc_stat_history[i])
395 cpuview_free_head(proc_stat_history[i]);
396 }
397}
398
f34de69a
CB
/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *    a. if not, fork a child in qpid's ns to send us
 *       ucred.pid = 1, and read the initpid. Cache
 *       initpid and creation time for /proc/initpid
 *       in a new store entry.
 *    b. if so, verify that /proc/initpid still matches
 *       what we have saved. If not, clear the store
 *       entry and go back to a. If so, return the
 *       cached initpid.
 */
struct pidns_init_store {
	ino_t ino;		/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;		/* the pid of init in that ns */
	long int ctime;		/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;
	long int lastcheck;	/* last time the entry was verified */
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Take @l or abort: we cannot limp along with broken locking. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	ret = pthread_mutex_lock(l);
	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
439
29a73c2f
CB
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierachies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;

/* fd of the private cgroup mount namespace; -1 until it is set up. */
static int cgroup_mount_ns_fd = -1;
461static void unlock_mutex(pthread_mutex_t *l)
462{
463 int ret;
464
465 if ((ret = pthread_mutex_unlock(l)) != 0) {
b8defc3d 466 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
237e200e
SH
467 exit(1);
468 }
469}
470
471static void store_lock(void)
472{
473 lock_mutex(&pidns_store_mutex);
474}
475
476static void store_unlock(void)
477{
478 unlock_mutex(&pidns_store_mutex);
479}
480
481/* Must be called under store_lock */
482static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
483{
484 struct stat initsb;
485 char fnam[100];
486
487 snprintf(fnam, 100, "/proc/%d", e->initpid);
488 if (stat(fnam, &initsb) < 0)
489 return false;
7dd6560a
CB
490
491 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
492 initsb.st_ctime, e->initpid);
493
237e200e
SH
494 if (e->ctime != initsb.st_ctime)
495 return false;
496 return true;
497}
498
499/* Must be called under store_lock */
500static void remove_initpid(struct pidns_init_store *e)
501{
502 struct pidns_init_store *tmp;
503 int h;
504
7dd6560a
CB
505 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
506
237e200e
SH
507 h = HASH(e->ino);
508 if (pidns_hash_table[h] == e) {
509 pidns_hash_table[h] = e->next;
510 free(e);
511 return;
512 }
513
514 tmp = pidns_hash_table[h];
515 while (tmp) {
516 if (tmp->next == e) {
517 tmp->next = e->next;
518 free(e);
519 return;
520 }
521 tmp = tmp->next;
522 }
523}
524
525#define PURGE_SECS 5
526/* Must be called under store_lock */
527static void prune_initpid_store(void)
528{
529 static long int last_prune = 0;
530 struct pidns_init_store *e, *prev, *delme;
531 long int now, threshold;
532 int i;
533
534 if (!last_prune) {
535 last_prune = time(NULL);
536 return;
537 }
538 now = time(NULL);
539 if (now < last_prune + PURGE_SECS)
540 return;
7dd6560a
CB
541
542 lxcfs_debug("%s\n", "Pruning.");
543
237e200e
SH
544 last_prune = now;
545 threshold = now - 2 * PURGE_SECS;
546
547 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
548 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
549 if (e->lastcheck < threshold) {
7dd6560a
CB
550
551 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
552
237e200e
SH
553 delme = e;
554 if (prev)
555 prev->next = e->next;
556 else
557 pidns_hash_table[i] = e->next;
558 e = e->next;
559 free(delme);
560 } else {
561 prev = e;
562 e = e->next;
563 }
564 }
565 }
566}
567
568/* Must be called under store_lock */
569static void save_initpid(struct stat *sb, pid_t pid)
570{
571 struct pidns_init_store *e;
572 char fpath[100];
573 struct stat procsb;
574 int h;
575
7dd6560a
CB
576 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
577
237e200e
SH
578 snprintf(fpath, 100, "/proc/%d", pid);
579 if (stat(fpath, &procsb) < 0)
580 return;
581 do {
582 e = malloc(sizeof(*e));
583 } while (!e);
584 e->ino = sb->st_ino;
585 e->initpid = pid;
586 e->ctime = procsb.st_ctime;
587 h = HASH(e->ino);
588 e->next = pidns_hash_table[h];
589 e->lastcheck = time(NULL);
590 pidns_hash_table[h] = e;
591}
592
593/*
594 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
595 * entry for the inode number and creation time. Verify that the init pid
596 * is still valid. If not, remove it. Return the entry if valid, NULL
597 * otherwise.
598 * Must be called under store_lock
599 */
600static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
601{
602 int h = HASH(sb->st_ino);
603 struct pidns_init_store *e = pidns_hash_table[h];
604
605 while (e) {
606 if (e->ino == sb->st_ino) {
607 if (initpid_still_valid(e, sb)) {
608 e->lastcheck = time(NULL);
609 return e;
610 }
611 remove_initpid(e);
612 return NULL;
613 }
614 e = e->next;
615 }
616
617 return NULL;
618}
619
0f657ce3 620static int is_dir(const char *path, int fd)
237e200e
SH
621{
622 struct stat statbuf;
0f657ce3 623 int ret = fstatat(fd, path, &statbuf, fd);
237e200e
SH
624 if (ret == 0 && S_ISDIR(statbuf.st_mode))
625 return 1;
626 return 0;
627}
628
629static char *must_copy_string(const char *str)
630{
631 char *dup = NULL;
632 if (!str)
633 return NULL;
634 do {
635 dup = strdup(str);
636 } while (!dup);
637
638 return dup;
639}
640
/* Remove every trailing '\n' from @s, in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t l = strlen(s);

	while (l > 0 && s[l - 1] == '\n')
		s[--l] = '\0';
}
648
#define BATCH_SIZE 50
/* Grow *mem (in BATCH_SIZE chunks) until it can hold @newlen bytes;
 * retries realloc until it succeeds. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;

		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}

/* Append @linelen bytes of @line (plus its NUL) to *contents. */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;

	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = newlen;
}
670
/*
 * Read the entire file referred to by @fd into a freshly allocated,
 * NUL-terminated buffer, with trailing newlines stripped. Takes
 * ownership of @fd (closed via fclose on success). Returns NULL on
 * failure. @from is kept for signature compatibility.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/* fdopen() failed, so it did not adopt @fd; close it
		 * here instead of leaking it (previous code leaked). */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);

	free(line);
	return contents;
}
692
/*
 * Write @string to the file referred to by @fd (@fnam is used for
 * error messages only). Takes ownership of @fd. Returns true iff the
 * full string was written and the stream closed cleanly.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() failed and did not adopt @fd; callers (e.g.
		 * cgfs_set_value()) rely on us closing it, so do not
		 * leak the descriptor (previous code leaked). */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
718
237e200e
SH
/* Metadata for one file inside a cgroup directory. */
struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

/* Growth step for the hierarchies array. */
#define ALLOC_NUM 20
237e200e
SH
726static bool store_hierarchy(char *stridx, char *h)
727{
0619767c
SH
728 if (num_hierarchies % ALLOC_NUM == 0) {
729 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
730 n *= ALLOC_NUM;
731 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 732 if (!tmp) {
b8defc3d 733 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
734 exit(1);
735 }
237e200e 736 hierarchies = tmp;
237e200e 737 }
f676eb79 738
0619767c 739 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
740 return true;
741}
742
743static void print_subsystems(void)
744{
745 int i;
746
a257a8ee 747 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 748 fprintf(stderr, "hierarchies:\n");
237e200e
SH
749 for (i = 0; i < num_hierarchies; i++) {
750 if (hierarchies[i])
b8defc3d
CB
751 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
752 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
753 }
754}
755
/* Return true iff @needle is one complete element of the
 * comma-separated list @haystack. */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack, *comma;
	size_t nlen = strlen(needle);

	while (*cur && (comma = strchr(cur, ','))) {
		if (nlen == (size_t)(comma - cur) &&
		    strncmp(needle, cur, nlen) == 0)
			return true;
		cur = comma + 1;
	}

	/* Last (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
774
775/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
776/* Return the mounted controller and store the corresponding open file descriptor
777 * referring to the controller mountpoint in the private lxcfs namespace in
778 * @cfd.
779 */
780static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
781{
782 int i;
783
784 for (i = 0; i < num_hierarchies; i++) {
785 if (!hierarchies[i])
786 continue;
5dd3e6fd
CB
787 if (strcmp(hierarchies[i], controller) == 0) {
788 *cfd = fd_hierarchies[i];
237e200e 789 return hierarchies[i];
5dd3e6fd
CB
790 }
791 if (in_comma_list(controller, hierarchies[i])) {
792 *cfd = fd_hierarchies[i];
237e200e 793 return hierarchies[i];
5dd3e6fd 794 }
237e200e
SH
795 }
796
797 return NULL;
798}
799
/* Write @value to @file inside @cgroup of @controller. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
826
827// Chown all the files in the cgroup directory. We do this when we create
828// a cgroup on behalf of a user.
f23fe717 829static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e 830{
f23fe717 831 struct dirent *direntp;
237e200e
SH
832 char path[MAXPATHLEN];
833 size_t len;
834 DIR *d;
f23fe717 835 int fd1, ret;
237e200e
SH
836
837 len = strlen(dirname);
838 if (len >= MAXPATHLEN) {
b8defc3d 839 lxcfs_error("Pathname too long: %s\n", dirname);
237e200e
SH
840 return;
841 }
842
f23fe717
CB
843 fd1 = openat(fd, dirname, O_DIRECTORY);
844 if (fd1 < 0)
845 return;
846
847 d = fdopendir(fd1);
237e200e 848 if (!d) {
b8defc3d 849 lxcfs_error("Failed to open %s\n", dirname);
237e200e
SH
850 return;
851 }
852
f23fe717 853 while ((direntp = readdir(d))) {
237e200e
SH
854 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
855 continue;
856 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
857 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 858 lxcfs_error("Pathname too long under %s\n", dirname);
237e200e
SH
859 continue;
860 }
f23fe717 861 if (fchownat(fd, path, uid, gid, 0) < 0)
b8defc3d 862 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
237e200e
SH
863 }
864 closedir(d);
865}
866
/* Create cgroup @cg under @controller, owned by @uid:@gid.
 * Returns 0 on success or a negative errno value. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	/* root:root needs no further ownership fixups. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}
897
7213ec5c 898static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
237e200e 899{
b7672ded 900 struct dirent *direntp;
237e200e
SH
901 DIR *dir;
902 bool ret = false;
903 char pathname[MAXPATHLEN];
b7672ded
CB
904 int dupfd;
905
906 dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
907 if (dupfd < 0)
908 return false;
237e200e 909
b7672ded 910 dir = fdopendir(dupfd);
237e200e 911 if (!dir) {
7dd6560a 912 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
7213ec5c 913 close(dupfd);
237e200e
SH
914 return false;
915 }
916
b7672ded 917 while ((direntp = readdir(dir))) {
237e200e
SH
918 struct stat mystat;
919 int rc;
920
237e200e
SH
921 if (!strcmp(direntp->d_name, ".") ||
922 !strcmp(direntp->d_name, ".."))
923 continue;
924
925 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
926 if (rc < 0 || rc >= MAXPATHLEN) {
b8defc3d 927 lxcfs_error("%s\n", "Pathname too long.");
237e200e
SH
928 continue;
929 }
930
2e81a5e3
CB
931 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
932 if (rc) {
7dd6560a 933 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
237e200e
SH
934 continue;
935 }
7dd6560a 936 if (S_ISDIR(mystat.st_mode))
2e81a5e3 937 if (!recursive_rmdir(pathname, fd, cfd))
7dd6560a 938 lxcfs_debug("Error removing %s.\n", pathname);
237e200e
SH
939 }
940
941 ret = true;
942 if (closedir(dir) < 0) {
b8defc3d 943 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
237e200e
SH
944 ret = false;
945 }
946
2e81a5e3 947 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
7dd6560a 948 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
237e200e
SH
949 ret = false;
950 }
7213ec5c
CB
951
952 close(dupfd);
237e200e
SH
953
954 return ret;
955}
956
957bool cgfs_remove(const char *controller, const char *cg)
958{
b7672ded 959 int fd, cfd;
237e200e 960 size_t len;
f5a6d92e 961 char *dirnam, *tmpc;
7213ec5c 962 bool bret;
237e200e 963
f5a6d92e 964 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
965 if (!tmpc)
966 return false;
f5a6d92e
CB
967
968 /* Make sure we pass a relative path to *at() family of functions.
969 * . + /cg + \0
970 */
b7672ded 971 len = strlen(cg) + 2;
237e200e 972 dirnam = alloca(len);
b7672ded
CB
973 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
974
975 fd = openat(cfd, dirnam, O_DIRECTORY);
976 if (fd < 0)
977 return false;
978
7213ec5c
CB
979 bret = recursive_rmdir(dirnam, fd, cfd);
980 close(fd);
981 return bret;
237e200e
SH
982}
983
/* chmod @file (relative to @controller's mount) to @mode. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, pathname, mode, 0) == 0;
}
1004
0f657ce3 1005static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e
SH
1006{
1007 size_t len;
1008 char *fname;
1009
1010 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
1011 fname = alloca(len);
1012 snprintf(fname, len, "%s/tasks", dirname);
0f657ce3 1013 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1014 return -errno;
1015 snprintf(fname, len, "%s/cgroup.procs", dirname);
0f657ce3 1016 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1017 return -errno;
1018 return 0;
1019}
1020
/* chown @file under @controller to @uid:@gid; for directories also
 * chown the tasks/cgroup.procs files inside. Returns 0 or -errno. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}
1046
/* Open @cgroup's cgroup.procs file for writing; NULL on failure. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1070
f366da65
WB
1071static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1072 void ***list, size_t typesize,
1073 void* (*iterator)(const char*, const char*, const char*))
237e200e 1074{
4ea38a4c 1075 int cfd, fd, ret;
237e200e 1076 size_t len;
4ea38a4c 1077 char *cg, *tmpc;
237e200e 1078 char pathname[MAXPATHLEN];
f366da65 1079 size_t sz = 0, asz = 0;
4ea38a4c 1080 struct dirent *dirent;
237e200e 1081 DIR *dir;
237e200e 1082
4ea38a4c 1083 tmpc = find_mounted_controller(controller, &cfd);
f366da65 1084 *list = NULL;
237e200e 1085 if (!tmpc)
e97c834b 1086 return false;
237e200e 1087
f5a6d92e 1088 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
1089 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1090 cg = alloca(len);
1091 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1092 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 1093 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
1094 return false;
1095 }
237e200e 1096
4ea38a4c
CB
1097 fd = openat(cfd, cg, O_DIRECTORY);
1098 if (fd < 0)
1099 return false;
1100
1101 dir = fdopendir(fd);
237e200e
SH
1102 if (!dir)
1103 return false;
1104
4ea38a4c 1105 while ((dirent = readdir(dir))) {
237e200e 1106 struct stat mystat;
237e200e 1107
4ea38a4c
CB
1108 if (!strcmp(dirent->d_name, ".") ||
1109 !strcmp(dirent->d_name, ".."))
237e200e
SH
1110 continue;
1111
4ea38a4c
CB
1112 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1113 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 1114 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
1115 continue;
1116 }
1117
4ea38a4c 1118 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1119 if (ret) {
b8defc3d 1120 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1121 continue;
1122 }
f366da65
WB
1123 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1124 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1125 continue;
1126
1127 if (sz+2 >= asz) {
f366da65 1128 void **tmp;
237e200e
SH
1129 asz += BATCH_SIZE;
1130 do {
f366da65 1131 tmp = realloc(*list, asz * typesize);
237e200e
SH
1132 } while (!tmp);
1133 *list = tmp;
1134 }
4ea38a4c 1135 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1136 (*list)[sz+1] = NULL;
1137 sz++;
1138 }
1139 if (closedir(dir) < 0) {
b8defc3d 1140 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1141 return false;
1142 }
1143 return true;
1144}
1145
f366da65
WB
1146static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1147{
1148 char *dup;
1149 do {
1150 dup = strdup(dir_entry);
1151 } while (!dup);
1152 return dup;
1153}
1154
/* List the names of all child cgroups of @cgroup under @controller into
 * *@list as a NULL-terminated array of heap strings (caller frees the
 * entries and the array).  Returns false if the cgroup cannot be read.
 */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1159
237e200e
SH
1160void free_key(struct cgfs_files *k)
1161{
1162 if (!k)
1163 return;
1164 free(k->name);
1165 free(k);
1166}
1167
/* Release a NULL-terminated array of cgfs_files entries and the array
 * itself (NULL is a no-op). */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **it;

	if (!keys)
		return;

	for (it = keys; *it; it++)
		free_key(*it);
	free(keys);
}
1179
/* Read the full contents of cgroup file @file under @cgroup into a
 * newly allocated string returned via *@value (caller frees).
 * Returns false if the controller is not mounted, the path would be
 * truncated, or the file cannot be opened.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	/* NOTE(review): presumably slurp_file() takes ownership of fd and
	 * closes it - confirm against its definition. */
	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
1206
951acc94
JS
/* Return true if cgroup file @file exists under @cgroup for
 * @controller (checked with faccessat(F_OK) relative to the
 * controller mount fd). */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int cfd, ret;
	size_t len;
	char *relpath, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() call:
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return faccessat(cfd, relpath, F_OK, 0) == 0;
}
1228
237e200e
SH
1229struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1230{
4ea38a4c 1231 int ret, cfd;
237e200e 1232 size_t len;
f5a6d92e 1233 char *fnam, *tmpc;
237e200e
SH
1234 struct stat sb;
1235 struct cgfs_files *newkey;
237e200e 1236
f5a6d92e 1237 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1238 if (!tmpc)
1239 return false;
1240
1241 if (file && *file == '/')
1242 file++;
1243
06081b29 1244 if (file && strchr(file, '/'))
237e200e
SH
1245 return NULL;
1246
f5a6d92e
CB
1247 /* Make sure we pass a relative path to *at() family of functions.
1248 * . + /cgroup + / + file + \0
1249 */
4ea38a4c 1250 len = strlen(cgroup) + 3;
237e200e
SH
1251 if (file)
1252 len += strlen(file) + 1;
1253 fnam = alloca(len);
4ea38a4c
CB
1254 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1255 file ? "/" : "", file ? file : "");
237e200e 1256
4ea38a4c 1257 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1258 if (ret < 0)
1259 return NULL;
1260
1261 do {
1262 newkey = malloc(sizeof(struct cgfs_files));
1263 } while (!newkey);
1264 if (file)
1265 newkey->name = must_copy_string(file);
06081b29
CB
1266 else if (strrchr(cgroup, '/'))
1267 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1268 else
1269 newkey->name = must_copy_string(cgroup);
1270 newkey->uid = sb.st_uid;
1271 newkey->gid = sb.st_gid;
1272 newkey->mode = sb.st_mode;
1273
1274 return newkey;
1275}
1276
/* Iterator callback for cgfs_iterate_cgroup(): wrap one directory entry
 * in a freshly allocated cgfs_files.  May return NULL on failure, which
 * the collector stores verbatim. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry) {
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);
	}
	return entry;
}
1286
/* List the key files of @cgroup under @controller into *@keys as a
 * NULL-terminated array of cgfs_files (caller frees with free_keys()).
 * Returns false if the cgroup cannot be read.
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1291
/* Return true if @f names an existing child cgroup directory of
 * @cgroup under @controller. */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() call:
	 * . + /cgroup + / + f + \0
	 */
	len = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1319
1320#define SEND_CREDS_OK 0
1321#define SEND_CREDS_NOTSK 1
1322#define SEND_CREDS_FAIL 2
1323static bool recv_creds(int sock, struct ucred *cred, char *v);
1324static int wait_for_pid(pid_t pid);
1325static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1326static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1327
1328/*
b10bdd6c 1329 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1330 * over a unix sock so we can read the task's reaper's pid in our
1331 * namespace
b10bdd6c
FG
1332 *
1333 * Note: glibc's fork() does not respect pidns, which can lead to failed
1334 * assertions inside glibc (and thus failed forks) if the child's pid in
1335 * the pidns and the parent pid outside are identical. Using clone prevents
1336 * this issue.
237e200e
SH
1337 */
/* Runs in a forked child: join @target's pid namespace, then clone()
 * a grandchild there which sends its credentials (pid 1 in that ns)
 * back over @sock.  Always terminates via _exit(). */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* Enter the target's pid namespace so the clone below happens there. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* clone() takes the top of the stack (stacks grow down). */
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* NOTE(review): wait_for_pid() returns 0 on success, so this
		 * exits 1 on success and 0 on failure; the caller appears to
		 * ignore this process's exit status - confirm intended. */
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1369
1370static int send_creds_clone_wrapper(void *arg) {
1371 struct ucred cred;
1372 char v;
1373 int sock = *(int *)arg;
237e200e
SH
1374
1375 /* we are the child */
1376 cred.uid = 0;
1377 cred.gid = 0;
1378 cred.pid = 1;
1379 v = '1';
1380 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1381 return 1;
1382 return 0;
237e200e
SH
1383}
1384
/* Return the pid (in our namespace) of the init process of @task's pid
 * namespace, or -1 on failure.  A forked helper joins the task's pidns
 * and sends SCM_CREDENTIALS claiming pid 1; the kernel translates that
 * pid into our namespace on receipt. */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* cred.pid arrives already translated into our pid namespace. */
	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1418
/* Cached lookup of the pidns init for @qpid, keyed by the pid-namespace
 * inode from /proc/<qpid>/ns/pid.  Falls back to the (expensive)
 * get_init_pid_for_task() on a cache miss and stores the result.
 * Returns 0 on failure.  Serialized by store_lock(). */
static pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}
1446
/* Reap @pid, retrying on EINTR.  Returns 0 if the child exited with
 * status 0, -1 otherwise (including invalid pid). */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1467
1468
1469/*
1470 * append pid to *src.
1471 * src: a pointer to a char* in which ot append the pid.
1472 * sz: the number of characters printed so far, minus trailing \0.
1473 * asz: the allocated size so far
1474 * pid: the pid to append
1475 */
1476static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1477{
1478 char tmp[30];
1479
1480 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1481
1482 if (!*src || tmplen + *sz + 1 >= *asz) {
1483 char *tmp;
1484 do {
1485 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1486 } while (!tmp);
1487 *src = tmp;
1488 *asz += BUF_RESERVE_SIZE;
1489 }
bbfd0e33 1490 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1491 *sz += tmplen;
237e200e
SH
1492}
1493
1494/*
1495 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1496 * valid in the caller's namespace, return the id mapped into
1497 * pid's namespace.
1498 * Returns the mapped id, or -1 on error.
1499 */
/*
 * Given a open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   /* base id of a range inside the file's ns */
		     host_base, /* base id of that range in the caller's ns */
		     range;     /* number of ids in the range */
	char buf[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(buf, 400, idfile)) {
		if (sscanf(buf, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		/* A procfile should never contain wrapping ranges; bail if
		 * it somehow does. */
		if (host_base + range < host_base || ns_base + range < ns_base) {
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, buf);
			return -1;
		}

		/* in_id falls inside [host_base, host_base+range): translate
		 * it; the checks above guarantee no overflow here. */
		if (host_base <= in_id && host_base + range > in_id)
			return (in_id - host_base) + ns_base;
	}

	/* no mapping covered in_id */
	return -1;
}
1537
1538/*
1539 * for is_privileged_over,
1540 * specify whether we require the calling uid to be root in his
1541 * namespace
1542 */
1543#define NS_ROOT_REQD true
1544#define NS_ROOT_OPT false
1545
1546#define PROCLEN 100
1547
/* Decide whether the fuse caller (@pid running as @uid) is privileged
 * over a file owned by @victim: the caller must be root (uid 0) inside
 * its own user namespace and @victim must be mapped into that
 * namespace.  With req_ns_root == NS_ROOT_OPT a plain uid match is
 * enough. */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices.  (i.e. uid 1000 has write
	 * access to files owned by uid 1000
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject */
	/* (convert_id_to_ns returns -1 on error, which is non-zero and
	 * therefore also rejected here) */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}
1593
/* Return true if the "other" permission bits in @fmode grant the access
 * requested by @req_mode's O_ACCMODE component. */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted;
	int acc = req_mode & O_ACCMODE;

	if (acc == O_RDONLY)
		wanted = S_IROTH;
	else if (acc == O_WRONLY)
		wanted = S_IWOTH;
	else if (acc == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
1613
1614
1615/*
1616 * taskcg is a/b/c
1617 * querycg is /a/b/c/d/e
1618 * we return 'd'
1619 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *start, *end;

	/* The query must be a strict prefix of the task's cgroup,
	 * otherwise there is no "next" component to return. */
	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		start = strdup(taskcg + 1);
	else
		start = strdup(taskcg + strlen(querycg) + 1);
	if (!start)
		return NULL;
	/* Truncate at the first '/' so only one path component remains;
	 * the result is heap-allocated and freed by the caller. */
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}
1640
/* Chop a single trailing newline off @x, in place. */
static void stripnewline(char *x)
{
	char *end = x + strlen(x);

	if (end > x && end[-1] == '\n')
		end[-1] = '\0';
}
1647
/* Return the cgroup path (e.g. "/lxc/c1") that @pid belongs to in
 * hierarchy @contrl, parsed from /proc/<pid>/cgroup.  The result is
 * heap-allocated (caller frees); NULL when the controller is not
 * mounted, the file is unreadable, or no matching line exists. */
static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	/* Each line has the form "<id>:<controller list>:<cgroup path>";
	 * keep the path of the line whose controller field matches h. */
	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		/* OOM handled by retrying forever, per must_*() convention. */
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}
1694
1695/*
1696 * check whether a fuse context may access a cgroup dir or file
1697 *
1698 * If file is not null, it is a cgroup file to check under cg.
1699 * If file is null, then we are checking perms on cg itself.
1700 *
1701 * For files we can check the mode of the list_keys result.
1702 * For cgroups, we must make assumptions based on the files under the
1703 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1704 * yet.
1705 */
1706static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1707{
1708 struct cgfs_files *k = NULL;
1709 bool ret = false;
1710
1711 k = cgfs_get_key(contrl, cg, file);
1712 if (!k)
1713 return false;
1714
1715 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1716 if (perms_include(k->mode >> 6, mode)) {
1717 ret = true;
1718 goto out;
1719 }
1720 }
1721 if (fc->gid == k->gid) {
1722 if (perms_include(k->mode >> 3, mode)) {
1723 ret = true;
1724 goto out;
1725 }
1726 }
1727 ret = perms_include(k->mode, mode);
1728
1729out:
1730 free_key(k);
1731 return ret;
1732}
1733
#define INITSCOPE "/init.scope"
/* Strip systemd's "/init.scope" suffix from cgroup path @cg, in place.
 * "/init.scope" alone collapses to "/"; deeper paths just lose the
 * suffix; paths without the suffix are untouched. */
static void prune_init_slice(char *cg)
{
	size_t full_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (full_len < suffix_len)
		return;

	tail = cg + (full_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0';
	else
		*tail = '\0';
}
1751
1752/*
1753 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1754 * If pid is in /a, he may act on /a/b, but not on /b.
1755 * if the answer is false and nextcg is not NULL, then *nextcg will point
1756 * to a string containing the next cgroup directory under cg, which must be
1757 * freed by the caller.
1758 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	/* Ignore systemd's trailing init.scope when comparing paths. */
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	/* Caller's own cgroup must be a prefix of @cg; if not, hand back
	 * the next path component toward the caller's cgroup (see the
	 * comment above this function). */
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}
1794
1795/*
1796 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1797 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	/* The hierarchy root is visible to everyone. */
	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* Drop the leading '/' so both sides compare without it. */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strmcps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
1845
1846/*
1847 * given /cgroup/freezer/a/b, return "freezer".
1848 * the returned char* should NOT be freed.
1849 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	/* "/cgroup/x" (9 chars) is the shortest path with a controller. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	/* path[7] must be the '/' after the "/cgroup" prefix. */
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8;
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	/* Keep only the first path component: the controller name. */
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	/* Return the canonical name from the mounted-hierarchy table so
	 * the result must NOT be freed by the caller. */
	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}
1881
1882/*
1883 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1884 * Note that the returned value may include files (keynames) etc
1885 */
/* Return a pointer into @path at the start of the cgroup component
 * (everything after "/cgroup/<controller>/"), or NULL with errno set
 * (EACCES: path too short, EINVAL: no cgroup component). */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* Skip "/cgroup/" and find the '/' ending the controller name. */
	sep = strchr(path + 8, '/');
	if (sep == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return sep + 1;
}
1902
1903/*
1904 * split the last path element from the path in @cg.
1905 * @dir is newly allocated and should be freed, @last not
1906*/
1907static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1908{
1909 char *p;
1910
1911 do {
1912 *dir = strdup(cg);
1913 } while (!*dir);
1914 *last = strrchr(cg, '/');
1915 if (!*last) {
1916 *last = NULL;
1917 return;
1918 }
1919 p = strrchr(*dir, '/');
1920 *p = '\0';
1921}
1922
1923/*
1924 * FUSE ops for /cgroup
1925 */
1926
/*
 * FUSE getattr for /cgroup paths.  /cgroup and /cgroup/<controller> are
 * synthetic 0755 directories; child cgroup dirs and key files take
 * uid/gid/mode from the underlying cgroupfs, filtered by what the
 * calling task's cgroup placement allows it to see.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* Split into parent dir (path1) and last component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Permission checks use the caller's pidns init, falling back to
	 * the calling pid itself. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: try it as a key file in path1. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2036
/*
 * FUSE opendir for /cgroup paths.  Validates visibility/permissions for
 * the caller and stashes a file_info of type LXC_TYPE_CGDIR in fi->fh
 * for cg_readdir()/cg_releasedir().
 */
int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		/* top level: NULL controller/cgroup tells readdir to list
		 * the mounted hierarchies */
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}
2087
/*
 * FUSE readdir for /cgroup paths.  At the top level emits the mounted
 * hierarchies; inside a hierarchy emits the cgroup's key files and
 * child cgroups, restricted to what the caller's own cgroup placement
 * permits it to see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller sits above this cgroup: only expose the next path
		 * component leading down toward the caller's own cgroup. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	for (i = 0; list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2171
43215927 2172static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2173{
43215927
SH
2174 struct file_info *f = (struct file_info *)fi->fh;
2175
237e200e
SH
2176 if (!f)
2177 return;
43215927
SH
2178
2179 fi->fh = 0;
2180
237e200e 2181 free(f->controller);
43215927 2182 f->controller = NULL;
237e200e 2183 free(f->cgroup);
43215927 2184 f->cgroup = NULL;
237e200e 2185 free(f->file);
43215927 2186 f->file = NULL;
237e200e 2187 free(f->buf);
43215927 2188 f->buf = NULL;
237e200e 2189 free(f);
bbb508dd 2190 f = NULL;
237e200e
SH
2191}
2192
/* FUSE releasedir: free the dir state allocated by cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2198
/*
 * FUSE open for a cgroup key file.  Verifies the key exists and that
 * the caller may access it with fi->flags, then stashes a file_info of
 * type LXC_TYPE_CGFILE in fi->fh for later read/write/release.
 */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* Split into parent cgroup (path1) and key name (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* The key must exist before we bother with permission checks. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}
2266
bddbb106
SH
/*
 * FUSE access for /cgroup paths.  The top level and controller dirs are
 * readable/searchable by everyone but never writable; everything deeper
 * is checked against the underlying cgroup file's owner and mode from
 * the caller's point of view.
 */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* Split into parent cgroup (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* Nonexistent entries are treated as read-only visible. */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2330
237e200e
SH
/*
 * cg_release: FUSE release() handler for cgroup files. Frees the
 * per-open file_info state that cg_open stashed in fi->fh.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2336
/* Events meaning "readable, or the peer hung up". */
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock: block until @sock becomes readable (or the peer hangs
 * up), or until @timeout seconds have elapsed. Returns true when the
 * fd is ready, false on timeout or error. An EINTR during the wait is
 * retried against the remaining time budget, so the overall deadline
 * is honored.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	/* Recompute the remaining budget so EINTR retries do not extend
	 * the deadline. */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* Preserve epoll_wait's errno across close(). */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2384
/*
 * msgrecv: receive up to @len bytes from @sockfd into @buf, waiting at
 * most two seconds for data to arrive. Returns the recv() result, or
 * -1 when the socket never became readable.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2391
2392static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2393{
2394 struct msghdr msg = { 0 };
2395 struct iovec iov;
2396 struct cmsghdr *cmsg;
2397 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2398 char buf[1];
2399 buf[0] = 'p';
2400
2401 if (pingfirst) {
2402 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2403 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2404 return SEND_CREDS_FAIL;
2405 }
2406 }
2407
2408 msg.msg_control = cmsgbuf;
2409 msg.msg_controllen = sizeof(cmsgbuf);
2410
2411 cmsg = CMSG_FIRSTHDR(&msg);
2412 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2413 cmsg->cmsg_level = SOL_SOCKET;
2414 cmsg->cmsg_type = SCM_CREDENTIALS;
2415 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2416
2417 msg.msg_name = NULL;
2418 msg.msg_namelen = 0;
2419
2420 buf[0] = v;
2421 iov.iov_base = buf;
2422 iov.iov_len = sizeof(buf);
2423 msg.msg_iov = &iov;
2424 msg.msg_iovlen = 1;
2425
2426 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2427 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2428 if (errno == 3)
2429 return SEND_CREDS_NOTSK;
2430 return SEND_CREDS_FAIL;
2431 }
2432
2433 return SEND_CREDS_OK;
2434}
2435
/*
 * recv_creds: receive one data byte into *@v together with the peer's
 * credentials into *@cred over @sock. Enables SO_PASSCRED, writes a
 * one-byte ping to tell the peer we are ready, then waits up to 2s for
 * the reply. On success *cred holds the sender's pid/uid/gid as seen
 * in our namespaces; on failure the fields stay -1 and false is
 * returned.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	/* Defaults in case no ancillary data arrives. */
	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* Ping the peer so it knows SO_PASSCRED is active before it sends. */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only copy credentials when the control message is exactly the
	 * SCM_CREDENTIALS payload we asked for. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2493
35174b0f
FG
/* Argument bundle passed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;          // pipe the child uses to ack the parent
	int sock;            // socketpair end handed to the wrapped function
	pid_t tpid;          // target pid whose pidns we operate in
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2500
/*
 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
 * with clone(). This simply writes '1' as ACK back to the parent
 * before calling the actual wrapped function.
 */
static int pid_ns_clone_wrapper(void *arg) {
	struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
	char b = '1';

	/* Close the read end; only the parent reads the ack. */
	close(args->cpipe[0]);
	if (write(args->cpipe[1], &b, sizeof(char)) < 0)
		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
	close(args->cpipe[1]);
	/* Return value becomes the child's exit status. */
	return args->wrapped(args->sock, args->tpid);
}
237e200e
SH
2516
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket. This shifts the pid from the
 * sender's pidns into tpid's pidns.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
	char v = '0';
	struct ucred cred;

	/* The kernel rewrites cred.pid into our pidns when delivering
	 * SCM_CREDENTIALS, so each received pid is already translated. */
	while (recv_creds(sock, &cred, &v)) {
		if (v == '1')
			/* '1' is the parent's stop sentinel. */
			return 0;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			return 1;
	}
	return 0;
}
2535
35174b0f 2536
237e200e
SH
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child and never returns: exits 0 on success, 1 on
 * any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Enter the target task's pid namespace. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* Pass the top of the stack: the stack grows downward on the
	 * architectures lxcfs supports. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2593
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * On success *d holds a newly-allocated, newline-separated list of the
 * pids translated into @tpid's pid namespace; the caller frees it.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send each host-side pid as credentials; the kernel
		 * translates it on delivery to the child's pidns. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* pid died meanwhile; skip it */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* v == '1' tells the child helper to exit. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2681
/*
 * cg_read: FUSE read() handler for cgroup files. pid-list files
 * (tasks, cgroup.procs) have their pids translated into the caller's
 * pid namespace; everything else is passed through from the host
 * cgroupfs. Returns bytes copied or a negative errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	/* Only whole reads from offset 0 are supported. */
	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Confirm the key still exists before reading. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Ensure the returned chunk ends in a newline when there is room. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2748
35174b0f 2749static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2750{
2751 pid_t vpid;
2752 struct ucred cred;
2753 char v;
2754 int ret;
2755
2756 cred.uid = 0;
2757 cred.gid = 0;
2758 while (1) {
2759 if (!wait_for_sock(sock, 2)) {
b8defc3d 2760 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2761 return 1;
237e200e
SH
2762 }
2763 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2764 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2765 return 1;
237e200e
SH
2766 }
2767 if (vpid == -1) // done
2768 break;
2769 v = '0';
2770 cred.pid = vpid;
2771 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2772 v = '1';
2773 cred.pid = getpid();
2774 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2775 return 1;
237e200e
SH
2776 }
2777 }
35174b0f 2778 return 0;
237e200e
SH
2779}
2780
/*
 * pid_from_ns_wrapper: counterpart of pid_to_ns_wrapper for the write
 * path. Enters @tpid's pid namespace, then clone()s a child (which is
 * the first task actually inside the new pidns) to run pid_from_ns.
 * Runs in a forked child and never returns: exits 0 on success, 1 on
 * any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* Top-of-stack pointer: stack grows downward. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2826
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Returns false (and leaves *answer at (uid_t)-1) when the map file
 * cannot be opened or no mapping exists.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];

	/* snprintf rather than sprintf: keep the path write bounded. */
	snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if ((f = fopen(line, "r")) == NULL) {
		return false;
	}

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* (uid_t)-1 is the "no mapping" sentinel. */
	if (*answer == (uid_t)-1)
		return false;
	return true;
}
2848
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid/*gid are left at -1 so callers can detect it.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;
	/* `line` doubles as the path buffer, then as the read buffer. */
	sprintf(line, "/proc/%d/status", pid);
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	while (fgets(line, 400, f)) {
		/* First field after "Uid:"/"Gid:" is the real id. */
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2887
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *  . they are the same task
 *  . they are owned by the same uid
 *  . @r is root on the host, or
 *  . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	/* Same task, or requestor is host root. */
	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);

	/* Same owner. */
	if (r_uid == v_uid)
		return true;

	/* Requestor is root in a userns into which the victim's uid maps. */
	if (hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	    hostuid_to_ns(v_uid, r, &mapped))
		return true;

	return false;
}
2913
/*
 * do_write_pids: write the newline-separated pid list in @buf (pids in
 * the writer @tpid's pidns) into the host-side tasks/cgroup.procs file
 * for @contrl/@cg. A forked helper inside the writer's pidns sends the
 * pids back as SCM_CREDENTIALS so the kernel translates them to host
 * pids; each move is permission-checked with may_move_pid(). Returns
 * true when every pid was written successfully.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* v == '0': cred.pid is the translated host pid.
			 * Anything else means the helper could not map it. */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1; /* -1 tells the helper child to exit */
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose flushes; a failed flush means the write failed. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2992
2993int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2994 struct fuse_file_info *fi)
2995{
2996 struct fuse_context *fc = fuse_get_context();
2997 char *localbuf = NULL;
2998 struct cgfs_files *k = NULL;
2999 struct file_info *f = (struct file_info *)fi->fh;
3000 bool r;
3001
3002 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 3003 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
3004 return -EIO;
3005 }
3006
3007 if (offset)
3008 return 0;
3009
3010 if (!fc)
3011 return -EIO;
3012
3013 localbuf = alloca(size+1);
3014 localbuf[size] = '\0';
3015 memcpy(localbuf, buf, size);
3016
3017 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3018 size = -EINVAL;
3019 goto out;
3020 }
3021
3022 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3023 size = -EACCES;
3024 goto out;
3025 }
3026
3027 if (strcmp(f->file, "tasks") == 0 ||
3028 strcmp(f->file, "/tasks") == 0 ||
3029 strcmp(f->file, "/cgroup.procs") == 0 ||
3030 strcmp(f->file, "cgroup.procs") == 0)
3031 // special case - we have to translate the pids
3032 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3033 else
3034 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3035
3036 if (!r)
3037 size = -EINVAL;
3038
3039out:
3040 free_key(k);
3041 return size;
3042}
3043
/*
 * cg_chown: FUSE chown() handler for the /cgroup tree. Allowed only
 * when the caller is privileged (root in a userns) over the file's
 * current owner. Returns 0 on success or a negative errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Neither /cgroup nor a bare controller dir may be chowned. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3109
/*
 * cg_chmod: FUSE chmod() handler for the /cgroup tree. Mirrors
 * cg_chown but with the laxer NS_ROOT_OPT privilege check. Returns 0
 * on success or a negative errno.
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Neither /cgroup nor a bare controller dir may be chmoded. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3178
/*
 * cg_mkdir: FUSE mkdir() handler for the /cgroup tree. The new cgroup
 * is created owned by the caller's uid/gid; the caller must be in an
 * ancestor cgroup of the parent and have write access to it. Returns
 * 0 on success or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory in which we create `last`. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* If the caller's own cgroup already matches the target
		 * name, report EEXIST rather than EPERM. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3232
/*
 * cg_rmdir: FUSE rmdir() handler for the /cgroup tree. Refuses to
 * remove /cgroup itself, bare controllers, or cgroups at the
 * container's top level; otherwise removes the cgroup if the caller
 * can see and write it. Returns 0 on success or a negative errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Removing the caller's own cgroup would be EBUSY;
		 * anything else it cannot see is ENOENT. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3293
/* startswith: true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3300
c6095b08
SH
/*
 * parse_memstat: pull the "total_*" counters we care about out of a
 * cgroup memory.stat buffer. Each value is reported by the kernel in
 * bytes and converted here to kilobytes. Counters whose line is not
 * present in @memstat are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	struct {
		const char *prefix;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached        },
		{ "total_active_anon",   active_anon   },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file   },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable   },
		{ "total_shmem",         shmem         },
	};
	size_t i;
	char *nl;

	while (*memstat) {
		/* First matching prefix wins, mirroring the original
		 * if/else-if chain. */
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			size_t plen = strlen(fields[i].prefix);

			if (strncmp(memstat, fields[i].prefix, plen) != 0)
				continue;
			sscanf(memstat + plen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}
		nl = strchr(memstat, '\n');
		if (!nl)
			return;
		memstat = nl + 1;
	}
}
3337
/*
 * get_blkio_io_value: scan a blkio stats buffer for the line
 * "<major>:<minor> <iotype> <value>" and store <value> into *v.
 * *v is set to 0 when no matching line is found.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *nl;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (; *str; str = nl + 1) {
		if (strncmp(str, key, keylen) == 0) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		nl = strchr(str, '\n');
		if (!nl)
			return;
	}
}
3360
/*
 * read_file: copy the contents of host file @path into the per-open
 * cache d->buf, record the full length in d->size, then copy at most
 * @size bytes into @buf. Returns the number of bytes copied into
 * @buf, or 0 on any error (missing file, cache overflow).
 */
static int read_file(const char *path, char *buf, size_t size,
		struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		/* snprintf returning >= the remaining space means the
		 * line would not fit in the cache. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3401
3402/*
3403 * FUSE ops for /proc
3404 */
3405
/*
 * get_memlimit: read the memory-limit key @file (e.g.
 * memory.limit_in_bytes) for @cgroup. Returns the parsed value, or
 * (unsigned long)-1 when the key cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (!cgfs_get_value("memory", cgroup, file, &value)) {
		free(value);
		return limit;
	}

	limit = strtoul(value, NULL, 10);
	free(value);
	return limit;
}
3418
/*
 * get_min_memlimit: walk from @cgroup up to the cgroup root and return
 * the smallest value of @file seen along the way — i.e. the effective
 * limit, since any ancestor's limit also constrains the child.
 * (unsigned long)-1 from get_memlimit means "unreadable/no limit" and
 * is ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* strdupa: dirname() below modifies the string in place. */
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	};

	return retlimit;
}
3435
3436static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3437 struct fuse_file_info *fi)
3438{
3439 struct fuse_context *fc = fuse_get_context();
3440 struct file_info *d = (struct file_info *)fi->fh;
3441 char *cg;
3442 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3443 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3444 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3445 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
559eaa8f 3446 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
594a10e6 3447 hostswtotal = 0;
237e200e
SH
3448 char *line = NULL;
3449 size_t linelen = 0, total_len = 0, rv = 0;
3450 char *cache = d->buf;
3451 size_t cache_size = d->buflen;
3452 FILE *f = NULL;
3453
3454 if (offset){
3455 if (offset > d->size)
3456 return -EINVAL;
3457 if (!d->cached)
3458 return 0;
3459 int left = d->size - offset;
3460 total_len = left > size ? size: left;
3461 memcpy(buf, cache + offset, total_len);
3462 return total_len;
3463 }
3464
3465 pid_t initpid = lookup_initpid_in_store(fc->pid);
3466 if (initpid <= 0)
3467 initpid = fc->pid;
3468 cg = get_pid_cgroup(initpid, "memory");
3469 if (!cg)
3470 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3471 prune_init_slice(cg);
237e200e 3472
018246ff 3473 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3474 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3475 goto err;
3476 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3477 goto err;
3478
3479 // Following values are allowed to fail, because swapaccount might be turned
3480 // off for current kernel
3481 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3482 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3483 {
018246ff 3484 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3485 memswusage = strtoul(memswusage_str, NULL, 10);
3486
237e200e
SH
3487 memswlimit = memswlimit / 1024;
3488 memswusage = memswusage / 1024;
3489 }
3490
3491 memusage = strtoul(memusage_str, NULL, 10);
3492 memlimit /= 1024;
3493 memusage /= 1024;
3494
c6095b08
SH
3495 parse_memstat(memstat_str, &cached, &active_anon,
3496 &inactive_anon, &active_file, &inactive_file,
559eaa8f 3497 &unevictable, &shmem);
237e200e
SH
3498
3499 f = fopen("/proc/meminfo", "r");
3500 if (!f)
3501 goto err;
3502
3503 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3504 ssize_t l;
237e200e
SH
3505 char *printme, lbuf[100];
3506
3507 memset(lbuf, 0, 100);
3508 if (startswith(line, "MemTotal:")) {
594a10e6 3509 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3510 if (hosttotal < memlimit)
3511 memlimit = hosttotal;
3512 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3513 printme = lbuf;
3514 } else if (startswith(line, "MemFree:")) {
3515 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3516 printme = lbuf;
3517 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3518 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3519 printme = lbuf;
3520 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3521 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3522 if (hostswtotal < memswlimit)
3523 memswlimit = hostswtotal;
3524 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3525 printme = lbuf;
3526 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3527 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3528 swapusage = memswusage - memusage,
3529 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3530 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3531 printme = lbuf;
da35d72a
SH
3532 } else if (startswith(line, "Slab:")) {
3533 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3534 printme = lbuf;
237e200e
SH
3535 } else if (startswith(line, "Buffers:")) {
3536 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3537 printme = lbuf;
3538 } else if (startswith(line, "Cached:")) {
3539 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3540 printme = lbuf;
3541 } else if (startswith(line, "SwapCached:")) {
3542 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3543 printme = lbuf;
2f306ad3 3544 } else if (startswith(line, "Active:")) {
c6095b08
SH
3545 snprintf(lbuf, 100, "Active: %8lu kB\n",
3546 active_anon + active_file);
3547 printme = lbuf;
2f306ad3 3548 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3549 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3550 inactive_anon + inactive_file);
3551 printme = lbuf;
3552 } else if (startswith(line, "Active(anon)")) {
3553 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3554 printme = lbuf;
3555 } else if (startswith(line, "Inactive(anon)")) {
3556 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3557 printme = lbuf;
3558 } else if (startswith(line, "Active(file)")) {
3559 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3560 printme = lbuf;
3561 } else if (startswith(line, "Inactive(file)")) {
3562 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3563 printme = lbuf;
3564 } else if (startswith(line, "Unevictable")) {
3565 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3566 printme = lbuf;
3567 } else if (startswith(line, "SReclaimable")) {
3568 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3569 printme = lbuf;
3570 } else if (startswith(line, "SUnreclaim")) {
3571 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3572 printme = lbuf;
559eaa8f
JS
3573 } else if (startswith(line, "Shmem:")) {
3574 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3575 printme = lbuf;
28cdea9b
JS
3576 } else if (startswith(line, "ShmemHugePages")) {
3577 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3578 printme = lbuf;
3579 } else if (startswith(line, "ShmemPmdMapped")) {
3580 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3581 printme = lbuf;
237e200e
SH
3582 } else
3583 printme = line;
3584
3585 l = snprintf(cache, cache_size, "%s", printme);
3586 if (l < 0) {
3587 perror("Error writing to cache");
3588 rv = 0;
3589 goto err;
3590
3591 }
3592 if (l >= cache_size) {
b8defc3d 3593 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3594 rv = 0;
3595 goto err;
3596 }
3597
3598 cache += l;
3599 cache_size -= l;
3600 total_len += l;
3601 }
3602
3603 d->cached = 1;
3604 d->size = total_len;
3605 if (total_len > size ) total_len = size;
3606 memcpy(buf, d->buf, total_len);
3607
3608 rv = total_len;
3609err:
3610 if (f)
3611 fclose(f);
3612 free(line);
3613 free(cg);
3614 free(memusage_str);
3615 free(memswlimit_str);
3616 free(memswusage_str);
3617 free(memstat_str);
237e200e
SH
3618 return rv;
3619}
3620
/*
 * Fetch the cpuset.cpus value for cgroup @cg.
 * Returns a newly allocated string the caller must free(), or NULL on
 * failure to read the cgroup file.
 */
static char *get_cpuset(const char *cg)
{
	char *value = NULL;

	if (cgfs_get_value("cpuset", cg, "cpuset.cpus", &value))
		return value;

	return NULL;
}
3633
3634bool cpu_in_cpuset(int cpu, const char *cpuset);
3635
/*
 * Return true if @line is a "processor : N" /proc/cpuinfo entry whose
 * CPU number N is a member of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}
3644
/*
 * Read a cgroup CPU quota parameter from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param` ("quota" or "period").
 * The parameter value is returned through `value`.
 * Returns true on success, false if the file could not be read or parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs_ + quota/period + _us + \0
	char *str = NULL;

	/* snprintf instead of sprintf: bound the write even if a caller ever
	 * passes a longer param name. */
	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64 instead of "%ld": on 32-bit targets long is 32 bits and
	 * mismatches int64_t, which is undefined behavior for sscanf. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3670
/*
 * Return the maximum number of visible CPUs based on the cgroup's CFS
 * quota/period. If no quota is set (or it cannot be read), zero is
 * returned. The result is capped at the number of online host CPUs.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, online;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	/* Round up: a fractional share of a period still occupies a CPU. */
	count = quota / period;
	if (quota % period)
		count++;

	online = get_nprocs();
	if (count > online)
		count = online;

	return count;
}
3704
/*
 * Determine whether CPU views should be used: both the "cpu" and the
 * "cpuacct" controllers must be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3723
237e200e
SH
/*
 * Check whether @line is a "processor : N" line from /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3735
/*
 * FUSE read handler for the virtualized /proc/cpuinfo.
 *
 * Renders a cpuinfo limited to the CPUs in the reader's cpuset cgroup,
 * renumbering "processor" entries from 0. When CPU views are enabled,
 * output is additionally truncated to the CFS-quota-derived CPU count.
 * The rendered text is cached in d->buf; non-zero offsets are served
 * from that cache. Returns the number of bytes copied, 0, or -EINVAL.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Continuation read: serve remaining bytes from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the reader to its container init, then to its cpuset cgroup. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		/* NOTE(review): path lacks the leading '/' that the meminfo
		 * handler uses ("/proc/meminfo") - this relies on the daemon's
		 * cwd being "/"; confirm intended. */
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x cpuinfo starts with a vendor line and uses a different
		 * per-CPU layout ("processor N:"); detect it on line one. */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's s390x processor-count line; it is
		 * regenerated below with the container's count.
		 * NOTE(review): strncmp length 12 compares "# processors"
		 * without the ':' - confirm that looser match is intended. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Quota reached: stop emitting further CPUs. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* Only print this CPU's stanza if it is in the cpuset;
			 * renumber visible CPUs sequentially from 0. */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x per-CPU line: renumber and keep the rest of
			 * the line after the ':' verbatim. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-processor lines are copied while inside a visible CPU's
		 * stanza (or always, on s390x). */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* On s390x, re-render into a fresh buffer so the vendor line and the
	 * container's processor count can be prepended to the filtered body. */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		/* Busy-loop until malloc succeeds (matches the file's policy
		 * of never serving a partial s390x render). */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3911
0ecddf02 3912static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3913{
9ac264cf 3914 int ret;
0ecddf02
CB
3915 FILE *f;
3916 uint64_t starttime;
3917 /* strlen("/proc/") = 6
3918 * +
3919 * LXCFS_NUMSTRLEN64
3920 * +
3921 * strlen("/stat") = 5
3922 * +
3923 * \0 = 1
3924 * */
3925#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3926 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3927 pid_t qpid;
3928
3929 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3930 if (qpid <= 0) {
3931 /* Caller can check for EINVAL on 0. */
3932 errno = EINVAL;
9ac264cf 3933 return 0;
0ecddf02 3934 }
9ac264cf 3935
0ecddf02
CB
3936 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3937 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3938 /* Caller can check for EINVAL on 0. */
3939 errno = EINVAL;
9ac264cf 3940 return 0;
0ecddf02 3941 }
9ac264cf 3942
0ecddf02
CB
3943 f = fopen(path, "r");
3944 if (!f) {
3945 /* Caller can check for EINVAL on 0. */
3946 errno = EINVAL;
9ac264cf 3947 return 0;
0ecddf02 3948 }
9ac264cf 3949
0ecddf02
CB
3950 /* Note that the *scanf() argument supression requires that length
3951 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3952 * at us. It's like telling someone you're not married and then asking
3953 * if you can bring your wife to the party.
3954 */
3955 ret = fscanf(f, "%*d " /* (1) pid %d */
3956 "%*s " /* (2) comm %s */
3957 "%*c " /* (3) state %c */
3958 "%*d " /* (4) ppid %d */
3959 "%*d " /* (5) pgrp %d */
3960 "%*d " /* (6) session %d */
3961 "%*d " /* (7) tty_nr %d */
3962 "%*d " /* (8) tpgid %d */
3963 "%*u " /* (9) flags %u */
3964 "%*u " /* (10) minflt %lu */
3965 "%*u " /* (11) cminflt %lu */
3966 "%*u " /* (12) majflt %lu */
3967 "%*u " /* (13) cmajflt %lu */
3968 "%*u " /* (14) utime %lu */
3969 "%*u " /* (15) stime %lu */
3970 "%*d " /* (16) cutime %ld */
3971 "%*d " /* (17) cstime %ld */
3972 "%*d " /* (18) priority %ld */
3973 "%*d " /* (19) nice %ld */
3974 "%*d " /* (20) num_threads %ld */
3975 "%*d " /* (21) itrealvalue %ld */
3976 "%" PRIu64, /* (22) starttime %llu */
3977 &starttime);
3978 if (ret != 1) {
3979 fclose(f);
3980 /* Caller can check for EINVAL on 0. */
3981 errno = EINVAL;
3982 return 0;
3983 }
3984
3985 fclose(f);
3986
3987 errno = 0;
3988 return starttime;
3989}
3990
/*
 * Return the reaper's start time converted from clock ticks to seconds
 * since boot. Returns 0 if the start time or the tick frequency cannot
 * be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t ticks;
	int64_t freq;

	ticks = get_reaper_start_time(pid);
	if (ticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	freq = sysconf(_SC_CLK_TCK);
	if (freq < 0 && errno == EINVAL) {
		lxcfs_debug(
			"%s\n",
			"failed to determine number of clock ticks in a second");
		return 0;
	}

	return ticks / freq;
}
4012
4013static uint64_t get_reaper_age(pid_t pid)
4014{
4015 uint64_t procstart, uptime, procage;
4016
4017 /* We need to substract the time the process has started since system
4018 * boot minus the time when the system has started to get the actual
4019 * reaper age.
4020 */
4021 procstart = get_reaper_start_time_in_sec(pid);
4022 procage = procstart;
4023 if (procstart > 0) {
4024 int ret;
4025 struct timespec spec;
4026
4027 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4028 if (ret < 0)
4029 return 0;
4030 /* We could make this more precise here by using the tv_nsec
4031 * field in the timespec struct and convert it to milliseconds
4032 * and then create a double for the seconds and milliseconds but
4033 * that seems more work than it is worth.
4034 */
4035 uptime = spec.tv_sec;
4036 procage = uptime - procstart;
4037 }
4038
4039 return procage;
4040}
4041
8be92dd1
JS
4042/*
4043 * Returns 0 on success.
4044 * It is the caller's responsibility to free `return_usage`, unless this
4045 * function returns an error.
4046 */
4047static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
4048{
77005a6c 4049 int cpucount = get_nprocs_conf();
8be92dd1
JS
4050 struct cpuacct_usage *cpu_usage;
4051 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4052 int cg_cpu;
4053 uint64_t cg_user, cg_system;
4054 int64_t ticks_per_sec;
4055 char *usage_str = NULL;
4056
4057 ticks_per_sec = sysconf(_SC_CLK_TCK);
4058
4059 if (ticks_per_sec < 0 && errno == EINVAL) {
4060 lxcfs_debug(
4061 "%s\n",
4062 "read_cpuacct_usage_all failed to determine number of clock ticks "
4063 "in a second");
4064 return -1;
4065 }
4066
4067 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4068 if (!cpu_usage)
4069 return -ENOMEM;
4070
4071 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4072 rv = -1;
4073 goto err;
4074 }
4075
4076 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4077 lxcfs_error("read_cpuacct_usage_all reading first line from "
4078 "%s/cpuacct.usage_all failed.\n", cg);
4079 rv = -1;
4080 goto err;
4081 }
4082
4083 read_pos += read_cnt;
4084
4085 for (i = 0, j = 0; i < cpucount; i++) {
4086 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4087 &cg_system, &read_cnt);
4088
4089 if (ret == EOF)
4090 break;
4091
4092 if (ret != 3) {
4093 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4094 "failed.\n", cg);
4095 rv = -1;
4096 goto err;
4097 }
4098
4099 read_pos += read_cnt;
4100
8be92dd1
JS
4101 /* Convert the time from nanoseconds to USER_HZ */
4102 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4103 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4104 j++;
4105 }
4106
4107 rv = 0;
4108 *return_usage = cpu_usage;
4109
4110err:
4111 if (usage_str)
4112 free(usage_str);
4113
4114 if (rv != 0) {
4115 free(cpu_usage);
4116 *return_usage = NULL;
4117 }
4118
4119 return rv;
4120}
4121
056adcef
JS
4122static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4123{
4124 int i;
4125 unsigned long sum = 0;
4126
4127 for (i = 0; i < cpu_count; i++) {
77005a6c
JS
4128 if (!newer[i].online)
4129 continue;
4130
056adcef
JS
4131 /* When cpuset is changed on the fly, the CPUs might get reordered.
4132 * We could either reset all counters, or check that the substractions
4133 * below will return expected results.
4134 */
4135 if (newer[i].user > older[i].user)
4136 diff[i].user = newer[i].user - older[i].user;
4137 else
4138 diff[i].user = 0;
4139
4140 if (newer[i].system > older[i].system)
4141 diff[i].system = newer[i].system - older[i].system;
4142 else
4143 diff[i].system = 0;
4144
4145 if (newer[i].idle > older[i].idle)
4146 diff[i].idle = newer[i].idle - older[i].idle;
4147 else
4148 diff[i].idle = 0;
4149
4150 sum += diff[i].user;
4151 sum += diff[i].system;
4152 sum += diff[i].idle;
4153 }
4154
4155 return sum;
4156}
4157
4158static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4159{
4160 unsigned long free_space, to_add;
4161
4162 free_space = threshold - usage->user - usage->system;
4163
4164 if (free_space > usage->idle)
4165 free_space = usage->idle;
4166
4167 to_add = free_space > *surplus ? *surplus : free_space;
4168
4169 *counter += to_add;
4170 usage->idle -= to_add;
4171 *surplus -= to_add;
4172}
4173
951acc94
JS
/*
 * Walk a bucket's linked list of per-cgroup CPU stat nodes and free every
 * node whose cgroup has disappeared (detected by probing for its
 * cpu.shares file). Returns the new list head, or NULL if the list is
 * now empty. Called from prune_proc_stat_history() with the bucket's
 * write lock held.
 */
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL, *prev, *tmp;

	for (prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			/* Cgroup is gone: unlink this node and free it. */
			tmp = node;
			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			/* First surviving node becomes the returned head. */
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}
4200
#define PROC_STAT_PRUNE_INTERVAL 10
/*
 * Sweep the global proc_stat_history hash and drop stat nodes for
 * cgroups that no longer exist. Each bucket is pruned at most once per
 * PROC_STAT_PRUNE_INTERVAL seconds; buckets are locked one at a time
 * with their write lock.
 */
static void prune_proc_stat_history(void)
{
	int i;
	time_t now = time(NULL);

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		/* NOTE(review): this returns (not `continue`s) on the first
		 * bucket whose interval has not yet elapsed, skipping all
		 * remaining buckets for this sweep - confirm the early return
		 * is intended rather than a per-bucket skip. */
		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		/* Only non-empty buckets are pruned and have their timestamp
		 * refreshed. */
		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}
4223
2f49b662 4224static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
056adcef 4225{
056adcef
JS
4226 struct cg_proc_stat *node;
4227
2f49b662
JS
4228 pthread_rwlock_rdlock(&head->lock);
4229
4230 if (!head->next) {
4231 pthread_rwlock_unlock(&head->lock);
056adcef 4232 return NULL;
2f49b662 4233 }
056adcef
JS
4234
4235 node = head->next;
4236
4237 do {
4238 if (strcmp(cg, node->cg) == 0)
951acc94 4239 goto out;
056adcef
JS
4240 } while ((node = node->next));
4241
951acc94
JS
4242 node = NULL;
4243
4244out:
2f49b662 4245 pthread_rwlock_unlock(&head->lock);
951acc94
JS
4246 prune_proc_stat_history();
4247 return node;
056adcef
JS
4248}
4249
4250static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4251{
4252 struct cg_proc_stat *node;
4253 int i;
4254
4255 node = malloc(sizeof(struct cg_proc_stat));
4256 if (!node)
4257 goto err;
4258
4259 node->cg = NULL;
4260 node->usage = NULL;
4261 node->view = NULL;
4262
4263 node->cg = malloc(strlen(cg) + 1);
4264 if (!node->cg)
4265 goto err;
4266
4267 strcpy(node->cg, cg);
4268
4269 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4270 if (!node->usage)
4271 goto err;
4272
4273 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4274
4275 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4276 if (!node->view)
4277 goto err;
4278
4279 node->cpu_count = cpu_count;
4280 node->next = NULL;
4281
2f49b662
JS
4282 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4283 lxcfs_error("%s\n", "Failed to initialize node lock");
4284 goto err;
4285 }
4286
056adcef
JS
4287 for (i = 0; i < cpu_count; i++) {
4288 node->view[i].user = 0;
4289 node->view[i].system = 0;
4290 node->view[i].idle = 0;
4291 }
4292
4293 return node;
4294
4295err:
4296 if (node && node->cg)
4297 free(node->cg);
4298 if (node && node->usage)
4299 free(node->usage);
4300 if (node && node->view)
4301 free(node->view);
4302 if (node)
4303 free(node);
4304
4305 return NULL;
4306}
4307
/*
 * Insert @new_node into the global per-cgroup stat hash under the
 * bucket's write lock. If another node for the same cgroup is already
 * present (e.g. another thread raced us), @new_node is freed and the
 * existing node is returned instead; otherwise @new_node is appended
 * and returned. The returned node is always the one linked in the list.
 */
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	/* Empty bucket: new node becomes the first entry. */
	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		/* Reached the tail: append the new node. */
		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}
4344
895f28e5
JS
4345static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4346{
4347 struct cpuacct_usage *new_usage, *new_view;
4348 int i;
4349
4350 /* Allocate new memory */
4351 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4352 if (!new_usage)
4353 return false;
4354
4355 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4356 if (!new_view) {
4357 free(new_usage);
4358 return false;
4359 }
4360
4361 /* Copy existing data & initialize new elements */
4362 for (i = 0; i < cpu_count; i++) {
4363 if (i < node->cpu_count) {
4364 new_usage[i].user = node->usage[i].user;
4365 new_usage[i].system = node->usage[i].system;
4366 new_usage[i].idle = node->usage[i].idle;
4367
4368 new_view[i].user = node->view[i].user;
4369 new_view[i].system = node->view[i].system;
4370 new_view[i].idle = node->view[i].idle;
4371 } else {
4372 new_usage[i].user = 0;
4373 new_usage[i].system = 0;
4374 new_usage[i].idle = 0;
4375
4376 new_view[i].user = 0;
4377 new_view[i].system = 0;
4378 new_view[i].idle = 0;
4379 }
4380 }
4381
4382 free(node->usage);
4383 free(node->view);
4384
4385 node->usage = new_usage;
4386 node->view = new_view;
4387 node->cpu_count = cpu_count;
4388
4389 return true;
4390}
4391
/*
 * Look up the cached CPU stat node for @cg, creating and registering one
 * from @usage if none exists yet. If the host gained CPUs since the node
 * was created, its counter arrays are expanded first.
 *
 * On success the node is returned with node->lock HELD - the caller must
 * unlock it when done. Returns NULL (with the lock released) on
 * allocation or expansion failure.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node() may hand back a pre-existing node if
		 * another thread registered one first; ours is freed then. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
					node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4427
4428static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4429{
4430 int i;
4431
4432 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4433 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4434
4435 for (i = 0; i < cpu_count; i++) {
4436 node->view[i].user = 0;
4437 node->view[i].system = 0;
4438 node->view[i].idle = 0;
4439 }
4440
4441 node->cpu_count = cpu_count;
4442}
4443
4444static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size)
4445{
4446 char *line = NULL;
4447 size_t linelen = 0, total_len = 0, rv = 0, l;
4448 int curcpu = -1; /* cpu numbering starts at 0 */
77005a6c 4449 int physcpu, i;
056adcef
JS
4450 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4451 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4452 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4453 unsigned long user_surplus = 0, system_surplus = 0;
4454 unsigned long total_sum, threshold;
4455 struct cg_proc_stat *stat_node;
4456 struct cpuacct_usage *diff = NULL;
77005a6c 4457 int nprocs = get_nprocs_conf();
056adcef
JS
4458
4459 /* Read all CPU stats and stop when we've encountered other lines */
4460 while (getline(&line, &linelen, f) != -1) {
77005a6c 4461 int ret;
056adcef
JS
4462 char cpu_char[10]; /* That's a lot of cores */
4463 uint64_t all_used, cg_used;
4464
4465 if (strlen(line) == 0)
4466 continue;
4467 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4468 /* not a ^cpuN line containing a number N */
4469 break;
4470 }
4471
77005a6c 4472 if (sscanf(cpu_char, "%d", &physcpu) != 1)
056adcef 4473 continue;
77005a6c 4474
056adcef
JS
4475 curcpu ++;
4476 cpu_cnt ++;
4477
77005a6c
JS
4478 if (!cpu_in_cpuset(physcpu, cpuset)) {
4479 for (i = curcpu; i <= physcpu; i++) {
4480 cg_cpu_usage[i].online = false;
4481 }
4482 continue;
4483 }
4484
4485 if (curcpu < physcpu) {
4486 /* Some CPUs may be disabled */
4487 for (i = curcpu; i < physcpu; i++)
4488 cg_cpu_usage[i].online = false;
4489
4490 curcpu = physcpu;
4491 }
4492
4493 cg_cpu_usage[curcpu].online = true;
4494
056adcef
JS
4495 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4496 &user,
4497 &nice,
4498 &system,
4499 &idle,
4500 &iowait,
4501 &irq,
4502 &softirq,
4503 &steal,
4504 &guest,
4505 &guest_nice);
4506
4507 if (ret != 10)
4508 continue;
4509
4510 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4511 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4512
4513 if (all_used >= cg_used) {
4514 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4515
4516 } else {
4517 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4518 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4519 curcpu, cg, all_used, cg_used);
4520 cg_cpu_usage[curcpu].idle = idle;
4521 }
4522 }
4523
4524 /* Cannot use more CPUs than is available due to cpuset */
4525 if (max_cpus > cpu_cnt)
4526 max_cpus = cpu_cnt;
4527
2f49b662 4528 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
056adcef
JS
4529
4530 if (!stat_node) {
2f49b662
JS
4531 lxcfs_error("unable to find/create stat node for %s\n", cg);
4532 rv = 0;
4533 goto err;
056adcef
JS
4534 }
4535
4536 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4537 if (!diff) {
4538 rv = 0;
4539 goto err;
4540 }
4541
4542 /*
4543 * If the new values are LOWER than values stored in memory, it means
4544 * the cgroup has been reset/recreated and we should reset too.
4545 */
77005a6c
JS
4546 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4547 if (!cg_cpu_usage[curcpu].online)
4548 continue;
4549
4550 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4551 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4552
4553 break;
4554 }
056adcef 4555
77005a6c
JS
4556 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4557
4558 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4559 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4560
4561 if (!stat_node->usage[curcpu].online)
4562 continue;
4563
4564 i++;
056adcef 4565
056adcef
JS
4566 stat_node->usage[curcpu].user += diff[curcpu].user;
4567 stat_node->usage[curcpu].system += diff[curcpu].system;
4568 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4569
77005a6c 4570 if (max_cpus > 0 && i >= max_cpus) {
056adcef
JS
4571 user_surplus += diff[curcpu].user;
4572 system_surplus += diff[curcpu].system;
4573 }
4574 }
4575
4576 /* Calculate usage counters of visible CPUs */
4577 if (max_cpus > 0) {
4578 /* threshold = maximum usage per cpu, including idle */
4579 threshold = total_sum / cpu_cnt * max_cpus;
4580
77005a6c
JS
4581 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4582 if (i == max_cpus)
4583 break;
4584
4585 if (!stat_node->usage[curcpu].online)
4586 continue;
4587
4588 i++;
4589
056adcef
JS
4590 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4591 continue;
4592
4593 /* Add user */
4594 add_cpu_usage(
4595 &user_surplus,
4596 &diff[curcpu],
4597 &diff[curcpu].user,
4598 threshold);
4599
4600 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4601 continue;
4602
4603 /* If there is still room, add system */
4604 add_cpu_usage(
4605 &system_surplus,
4606 &diff[curcpu],
4607 &diff[curcpu].system,
4608 threshold);
4609 }
4610
4611 if (user_surplus > 0)
4612 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4613 if (system_surplus > 0)
4614 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4615
77005a6c
JS
4616 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4617 if (i == max_cpus)
4618 break;
4619
4620 if (!stat_node->usage[curcpu].online)
4621 continue;
4622
4623 i++;
4624
056adcef
JS
4625 stat_node->view[curcpu].user += diff[curcpu].user;
4626 stat_node->view[curcpu].system += diff[curcpu].system;
4627 stat_node->view[curcpu].idle += diff[curcpu].idle;
4628
4629 user_sum += stat_node->view[curcpu].user;
4630 system_sum += stat_node->view[curcpu].system;
4631 idle_sum += stat_node->view[curcpu].idle;
4632 }
4633
4634 } else {
77005a6c
JS
4635 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4636 if (!stat_node->usage[curcpu].online)
4637 continue;
4638
056adcef
JS
4639 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4640 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4641 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4642
4643 user_sum += stat_node->view[curcpu].user;
4644 system_sum += stat_node->view[curcpu].system;
4645 idle_sum += stat_node->view[curcpu].idle;
4646 }
4647 }
4648
4649 /* Render the file */
4650 /* cpu-all */
4651 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4652 user_sum,
4653 system_sum,
4654 idle_sum);
4655
4656 if (l < 0) {
4657 perror("Error writing to cache");
4658 rv = 0;
4659 goto err;
4660
4661 }
4662 if (l >= buf_size) {
4663 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4664 rv = 0;
4665 goto err;
4666 }
4667
4668 buf += l;
4669 buf_size -= l;
4670 total_len += l;
4671
4672 /* Render visible CPUs */
77005a6c
JS
4673 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4674 if (!stat_node->usage[curcpu].online)
4675 continue;
4676
4677 i++;
4678
4679 if (max_cpus > 0 && i == max_cpus)
056adcef
JS
4680 break;
4681
4682 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
77005a6c 4683 i,
056adcef
JS
4684 stat_node->view[curcpu].user,
4685 stat_node->view[curcpu].system,
4686 stat_node->view[curcpu].idle);
4687
4688 if (l < 0) {
4689 perror("Error writing to cache");
4690 rv = 0;
4691 goto err;
4692
4693 }
4694 if (l >= buf_size) {
4695 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4696 rv = 0;
4697 goto err;
4698 }
4699
4700 buf += l;
4701 buf_size -= l;
4702 total_len += l;
4703 }
4704
4705 /* Pass the rest of /proc/stat, start with the last line read */
4706 l = snprintf(buf, buf_size, "%s", line);
4707
4708 if (l < 0) {
4709 perror("Error writing to cache");
4710 rv = 0;
4711 goto err;
4712
4713 }
4714 if (l >= buf_size) {
4715 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4716 rv = 0;
4717 goto err;
4718 }
4719
4720 buf += l;
4721 buf_size -= l;
4722 total_len += l;
4723
4724 /* Pass the rest of the host's /proc/stat */
4725 while (getline(&line, &linelen, f) != -1) {
4726 l = snprintf(buf, buf_size, "%s", line);
4727 if (l < 0) {
4728 perror("Error writing to cache");
4729 rv = 0;
4730 goto err;
4731 }
4732 if (l >= buf_size) {
4733 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4734 rv = 0;
4735 goto err;
4736 }
4737 buf += l;
4738 buf_size -= l;
4739 total_len += l;
4740 }
4741
4742 rv = total_len;
4743
4744err:
2f49b662
JS
4745 if (stat_node)
4746 pthread_mutex_unlock(&stat_node->lock);
056adcef
JS
4747 if (line)
4748 free(line);
4749 if (diff)
4750 free(diff);
4751 return rv;
4752}
4753
f34de69a 4754#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
/*
 * Read handler for the virtualized /proc/stat.
 *
 * If the caller's init process is in a container with its own cpuset
 * cgroup, only CPUs in that cpuset are rendered, renumbered from cpu0.
 * When cpuacct.usage_all data is available, per-cpu user/system times
 * come from the cgroup and idle time is derived from the host's
 * /proc/stat; otherwise the host's per-cpu line is passed through with
 * the cpu renumbered.  Without a cpuset cgroup the host file is served
 * unchanged.
 *
 * Returns the number of bytes copied into @buf, or 0 on error (rv is
 * left at 0 on every error path).
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all: per-cpu lines are rendered after the first
	 * CPUALL_MAX_SIZE bytes, and the summary "cpu " line is copied to
	 * the front once the sums are known. */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;

	/* Continuation of an earlier read: serve from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
		lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line (the aggregate "cpu " line; it is recomputed below)
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* When the cpu view is enabled, the whole rendering is delegated. */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		/* physcpu is the host cpu number; curcpu is the renumbered
		 * index presented to the container (only cpus in the cpuset
		 * count). */
		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* If the host line could not be fully parsed, or there is no
		 * cgroup data, pass the host's values through under the
		 * renumbered cpu id. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			/* Idle as seen by the container: host idle plus all
			 * host busy time not attributed to this cgroup. */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
				 new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Render the aggregate "cpu " line into the reserved front area. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
		user_sum,
		nice_sum,
		system_sum,
		idle_sum,
		iowait_sum,
		irq_sum,
		softirq_sum,
		steal_sum,
		guest_sum,
		guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Pull the per-cpu lines up so they directly follow the cpuall line. */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4991
0ecddf02
CB
4992/* This function retrieves the busy time of a group of tasks by looking at
4993 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4994 * been given it's own cpuacct cgroup. If not, this function will take the busy
4995 * time of all other taks that do not actually belong to the container into
4996 * account as well. If someone has a clever solution for this please send a
4997 * patch!
4998 */
237e200e
SH
4999static unsigned long get_reaper_busy(pid_t task)
5000{
5001 pid_t initpid = lookup_initpid_in_store(task);
5002 char *cgroup = NULL, *usage_str = NULL;
5003 unsigned long usage = 0;
5004
5005 if (initpid <= 0)
5006 return 0;
5007
5008 cgroup = get_pid_cgroup(initpid, "cpuacct");
5009 if (!cgroup)
5010 goto out;
6d2f6996 5011 prune_init_slice(cgroup);
237e200e
SH
5012 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
5013 goto out;
5014 usage = strtoul(usage_str, NULL, 10);
5015 usage /= 1000000000;
5016
5017out:
5018 free(cgroup);
5019 free(usage_str);
5020 return usage;
5021}
5022
#if RELOADTEST
/* Drop a marker file so the reload test can detect that the library ran. */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd >= 0)
		close(fd);
}
#endif
5033
5034/*
5035 * We read /proc/uptime and reuse its second field.
5036 * For the first field, we use the mtime for the reaper for
5037 * the calling pid as returned by getreaperage
5038 */
5039static int proc_uptime_read(char *buf, size_t size, off_t offset,
5040 struct fuse_file_info *fi)
5041{
5042 struct fuse_context *fc = fuse_get_context();
5043 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 5044 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 5045 char *cache = d->buf;
a262ddb7 5046 ssize_t total_len = 0;
0ecddf02 5047 uint64_t idletime, reaperage;
237e200e
SH
5048
5049#if RELOADTEST
5050 iwashere();
5051#endif
5052
5053 if (offset){
237e200e
SH
5054 if (!d->cached)
5055 return 0;
bbdf646b
BM
5056 if (offset > d->size)
5057 return -EINVAL;
237e200e
SH
5058 int left = d->size - offset;
5059 total_len = left > size ? size: left;
5060 memcpy(buf, cache + offset, total_len);
5061 return total_len;
5062 }
5063
0ecddf02
CB
5064 reaperage = get_reaper_age(fc->pid);
5065 /* To understand why this is done, please read the comment to the
5066 * get_reaper_busy() function.
5067 */
5068 idletime = reaperage;
5069 if (reaperage >= busytime)
5070 idletime = reaperage - busytime;
237e200e 5071
bbdf646b
BM
5072 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5073 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 5074 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
5075 return 0;
5076 }
5077
5078 d->size = (int)total_len;
5079 d->cached = 1;
5080
5081 if (total_len > size) total_len = size;
5082
5083 memcpy(buf, d->buf, total_len);
5084 return total_len;
5085}
5086
5087static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5088 struct fuse_file_info *fi)
5089{
5090 char dev_name[72];
5091 struct fuse_context *fc = fuse_get_context();
5092 struct file_info *d = (struct file_info *)fi->fh;
5093 char *cg;
5094 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5095 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5096 unsigned long read = 0, write = 0;
5097 unsigned long read_merged = 0, write_merged = 0;
5098 unsigned long read_sectors = 0, write_sectors = 0;
5099 unsigned long read_ticks = 0, write_ticks = 0;
5100 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5101 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5102 char *cache = d->buf;
5103 size_t cache_size = d->buflen;
5104 char *line = NULL;
5105 size_t linelen = 0, total_len = 0, rv = 0;
5106 unsigned int major = 0, minor = 0;
5107 int i = 0;
5108 FILE *f = NULL;
5109
5110 if (offset){
5111 if (offset > d->size)
5112 return -EINVAL;
5113 if (!d->cached)
5114 return 0;
5115 int left = d->size - offset;
5116 total_len = left > size ? size: left;
5117 memcpy(buf, cache + offset, total_len);
5118 return total_len;
5119 }
5120
5121 pid_t initpid = lookup_initpid_in_store(fc->pid);
5122 if (initpid <= 0)
5123 initpid = fc->pid;
5124 cg = get_pid_cgroup(initpid, "blkio");
5125 if (!cg)
5126 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 5127 prune_init_slice(cg);
237e200e 5128
2209fe50 5129 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 5130 goto err;
2209fe50 5131 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 5132 goto err;
2209fe50 5133 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 5134 goto err;
2209fe50 5135 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 5136 goto err;
2209fe50 5137 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
5138 goto err;
5139
5140
5141 f = fopen("/proc/diskstats", "r");
5142 if (!f)
5143 goto err;
5144
5145 while (getline(&line, &linelen, f) != -1) {
a262ddb7 5146 ssize_t l;
2209fe50 5147 char lbuf[256];
237e200e
SH
5148
5149 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 5150 if (i != 3)
237e200e 5151 continue;
2209fe50
SH
5152
5153 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5154 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5155 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5156 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5157 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5158 read_sectors = read_sectors/512;
5159 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5160 write_sectors = write_sectors/512;
5161
5162 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5163 rd_svctm = rd_svctm/1000000;
5164 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5165 rd_wait = rd_wait/1000000;
5166 read_ticks = rd_svctm + rd_wait;
5167
5168 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5169 wr_svctm = wr_svctm/1000000;
5170 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5171 wr_wait = wr_wait/1000000;
5172 write_ticks = wr_svctm + wr_wait;
5173
5174 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5175 tot_ticks = tot_ticks/1000000;
237e200e
SH
5176
5177 memset(lbuf, 0, 256);
2db31eb6
SH
5178 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5179 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5180 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5181 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5182 else
5183 continue;
237e200e 5184
2209fe50 5185 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
5186 if (l < 0) {
5187 perror("Error writing to fuse buf");
5188 rv = 0;
5189 goto err;
5190 }
5191 if (l >= cache_size) {
b8defc3d 5192 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
5193 rv = 0;
5194 goto err;
5195 }
5196 cache += l;
5197 cache_size -= l;
5198 total_len += l;
5199 }
5200
5201 d->cached = 1;
5202 d->size = total_len;
5203 if (total_len > size ) total_len = size;
5204 memcpy(buf, d->buf, total_len);
5205
5206 rv = total_len;
5207err:
5208 free(cg);
5209 if (f)
5210 fclose(f);
5211 free(line);
5212 free(io_serviced_str);
5213 free(io_merged_str);
5214 free(io_service_bytes_str);
5215 free(io_wait_time_str);
5216 free(io_service_time_str);
5217 return rv;
5218}
5219
70dcc12e
SH
5220static int proc_swaps_read(char *buf, size_t size, off_t offset,
5221 struct fuse_file_info *fi)
5222{
5223 struct fuse_context *fc = fuse_get_context();
5224 struct file_info *d = (struct file_info *)fi->fh;
5225 char *cg = NULL;
018246ff 5226 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 5227 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
5228 ssize_t total_len = 0, rv = 0;
5229 ssize_t l = 0;
70dcc12e
SH
5230 char *cache = d->buf;
5231
5232 if (offset) {
5233 if (offset > d->size)
5234 return -EINVAL;
5235 if (!d->cached)
5236 return 0;
5237 int left = d->size - offset;
5238 total_len = left > size ? size: left;
5239 memcpy(buf, cache + offset, total_len);
5240 return total_len;
5241 }
5242
5243 pid_t initpid = lookup_initpid_in_store(fc->pid);
5244 if (initpid <= 0)
5245 initpid = fc->pid;
5246 cg = get_pid_cgroup(initpid, "memory");
5247 if (!cg)
5248 return read_file("/proc/swaps", buf, size, d);
6d2f6996 5249 prune_init_slice(cg);
70dcc12e 5250
018246ff 5251 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
5252
5253 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5254 goto err;
5255
70dcc12e
SH
5256 memusage = strtoul(memusage_str, NULL, 10);
5257
5258 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5259 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5260
018246ff 5261 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
5262 memswusage = strtoul(memswusage_str, NULL, 10);
5263
70dcc12e
SH
5264 swap_total = (memswlimit - memlimit) / 1024;
5265 swap_free = (memswusage - memusage) / 1024;
5266 }
5267
5268 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5269
5270 /* When no mem + swap limit is specified or swapaccount=0*/
5271 if (!memswlimit) {
5272 char *line = NULL;
5273 size_t linelen = 0;
5274 FILE *f = fopen("/proc/meminfo", "r");
5275
5276 if (!f)
5277 goto err;
5278
5279 while (getline(&line, &linelen, f) != -1) {
5280 if (startswith(line, "SwapTotal:")) {
5281 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5282 } else if (startswith(line, "SwapFree:")) {
5283 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5284 }
5285 }
5286
5287 free(line);
5288 fclose(f);
5289 }
5290
5291 if (swap_total > 0) {
a262ddb7
CB
5292 l = snprintf(d->buf + total_len, d->size - total_len,
5293 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5294 swap_total, swap_free);
5295 total_len += l;
70dcc12e
SH
5296 }
5297
a262ddb7 5298 if (total_len < 0 || l < 0) {
70dcc12e
SH
5299 perror("Error writing to cache");
5300 rv = 0;
5301 goto err;
5302 }
5303
5304 d->cached = 1;
5305 d->size = (int)total_len;
5306
5307 if (total_len > size) total_len = size;
5308 memcpy(buf, d->buf, total_len);
5309 rv = total_len;
5310
5311err:
5312 free(cg);
5313 free(memswlimit_str);
5314 free(memlimit_str);
5315 free(memusage_str);
5316 free(memswusage_str);
70dcc12e
SH
5317 return rv;
5318}
/*
 * Find the process pids from a cgroup path.
 * e.g. from /sys/fs/cgroup/cpu/docker/<id>/cgroup.procs (and its child
 * cgroups, up to @depth levels deep) gather every pid line.
 * @pid_buf : (in/out) growable array of pid strings.
 * @dpath   : cgroup path relative to @cfd, e.g. /docker/<id>/...
 * @depth   : how many directory levels below @dpath to descend.
 * @sum     : number of pids collected so far; the new total is returned.
 * @cfd     : fd of the mounted cgroup hierarchy, e.g. /sys/fs/cgroup/cpu.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	char *procs_path, *child;
	char *line = NULL;
	size_t linelen = 0;
	struct dirent *de;
	DIR *dir;
	FILE *f = NULL;
	int fd;

	/* room for dpath + "/cgroup.procs" + NUL */
	do {
		procs_path = malloc(strlen(dpath) + 20);
	} while (!procs_path);

	strcpy(procs_path, dpath);
	fd = openat(cfd, procs_path, O_RDONLY);
	if (fd < 0)
		goto out;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		goto out;
	}

	/* First recurse into child cgroups, while depth allows. */
	while ((de = readdir(dir)) != NULL && depth > 0) {
		if (de->d_name[0] == '.')
			continue;
		if (de->d_type != DT_DIR)
			continue;

		/* procs_path + '/' + d_name + NUL */
		do {
			child = malloc(strlen(procs_path) + 2 + sizeof(de->d_name));
		} while (!child);
		strcpy(child, procs_path);
		strcat(child, "/");
		strcat(child, de->d_name);
		sum = calc_pid(pid_buf, child, depth - 1, sum, cfd);
		free(child);
	}
	closedir(dir); /* also closes fd */

	/* Then collect this cgroup's own pid list. */
	strcat(procs_path, "/cgroup.procs");
	fd = openat(cfd, procs_path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		char **grown;

		do {
			grown = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!grown);
		*pid_buf = grown;
		do {
			(*pid_buf)[sum] = malloc(strlen(line) + 1);
		} while (!(*pid_buf)[sum]);
		strcpy((*pid_buf)[sum], line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(procs_path);
	return sum;
}
5405/*
5406 * calc_load calculates the load according to the following formula:
5407 * load1 = load0 * exp + active * (1 - exp)
5408 *
5409 * @load1: the new loadavg.
5410 * @load0: the former loadavg.
5411 * @active: the total number of running pid at this moment.
5412 * @exp: the fixed-point defined in the beginning.
5413 */
5414static unsigned long
5415calc_load(unsigned long load, unsigned long exp, unsigned long active)
5416{
5417 unsigned long newload;
5418
5419 active = active > 0 ? active * FIXED_1 : 0;
5420 newload = load * exp + active * (FIXED_1 - exp);
5421 if (active >= load)
5422 newload += FIXED_1 - 1;
5423
5424 return newload / FIXED_1;
5425}
5426
5427/*
5428 * Return 0 means that container p->cg is closed.
5429 * Return -1 means that error occurred in refresh.
5430 * Positive num equals the total number of pid.
5431 */
5432static int refresh_load(struct load_node *p, char *path)
5433{
5434 FILE *f = NULL;
5435 char **idbuf;
5436 char proc_path[256];
5437 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5438 char *line = NULL;
5439 size_t linelen = 0;
5440 int sum, length;
5441 DIR *dp;
5442 struct dirent *file;
5443
5444 do {
5445 idbuf = malloc(sizeof(char *));
5446 } while (!idbuf);
5447 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5448 /* normal exit */
5449 if (sum == 0)
5450 goto out;
5451
5452 for (i = 0; i < sum; i++) {
5453 /*clean up '\n' */
5454 length = strlen(idbuf[i])-1;
5455 idbuf[i][length] = '\0';
5456 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5457 if (ret < 0 || ret > 255) {
5458 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5459 i = sum;
5460 sum = -1;
5461 goto err_out;
5462 }
5463
5464 dp = opendir(proc_path);
5465 if (!dp) {
5466 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5467 continue;
5468 }
5469 while ((file = readdir(dp)) != NULL) {
5470 if (strncmp(file->d_name, ".", 1) == 0)
5471 continue;
5472 if (strncmp(file->d_name, "..", 1) == 0)
5473 continue;
5474 total_pid++;
5475 /* We make the biggest pid become last_pid.*/
5476 ret = atof(file->d_name);
5477 last_pid = (ret > last_pid) ? ret : last_pid;
5478
5479 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5480 if (ret < 0 || ret > 255) {
5481 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5482 i = sum;
5483 sum = -1;
5484 closedir(dp);
5485 goto err_out;
5486 }
5487 f = fopen(proc_path, "r");
5488 if (f != NULL) {
5489 while (getline(&line, &linelen, f) != -1) {
5490 /* Find State */
5491 if ((line[0] == 'S') && (line[1] == 't'))
5492 break;
5493 }
5494 if ((line[7] == 'R') || (line[7] == 'D'))
5495 run_pid++;
5496 fclose(f);
5497 }
5498 }
5499 closedir(dp);
5500 }
5501 /*Calculate the loadavg.*/
5502 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5503 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5504 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5505 p->run_pid = run_pid;
5506 p->total_pid = total_pid;
5507 p->last_pid = last_pid;
5508
5509 free(line);
beb5024e 5510err_out:
6db4f7a3 5511 for (; i > 0; i--)
5512 free(idbuf[i-1]);
5513out:
5514 free(idbuf);
5515 return sum;
5516}
/*
 * Traverse the hash table and update it.
 *
 * Background thread body: loops until loadavg_stop is set, refreshing
 * every tracked cgroup's load data roughly every FLUSH_TIME seconds.
 */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* Relative cgroup paths get a leading '.' so
				 * they resolve against the mounted hierarchy. */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}
				sum = refresh_load(f, path);
				if (sum == 0) {
					/* Cgroup is empty/closed: drop it. */
					f = del_node(f, i);
				} else {
out:					f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		time2 = clock();
		/* NOTE(review): clock() measures CPU time, not wall time, and
		 * if the refresh outlasts FLUSH_TIME the argument can go
		 * negative before the implicit conversion to useconds_t —
		 * worth confirming the intended sleep behavior. */
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5577
/*
 * Read handler for the virtualized /proc/loadavg.
 *
 * When the loadavg feature is enabled and the caller belongs to a cpu
 * cgroup, the averages come from the per-cgroup hash table maintained
 * by load_begin(); otherwise the host's /proc/loadavg is served.
 *
 * Locking: locate_node() returns with load_hash[hash].rdlock held; it
 * is released here after the node's values have been rendered.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation of an earlier read: serve from the cache. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	n = locate_node(cg, hash);

	/* First time this cgroup is seen: register it with zeroed averages. */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 rounds the fixed-point averages to two decimals. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	/* NOTE(review): total_len is size_t, so the < 0 arm can never fire;
	 * snprintf failure would appear as a huge unsigned value instead. */
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5670/* Return a positive number on success, return 0 on failure.*/
5671pthread_t load_daemon(int load_use)
5672{
5673 int ret;
5674 pthread_t pid;
5675
5676 ret = init_load();
5677 if (ret == -1) {
5678 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5679 return 0;
5680 }
5681 ret = pthread_create(&pid, NULL, load_begin, NULL);
5682 if (ret != 0) {
5683 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5684 load_free();
5685 return 0;
5686 }
5687 /* use loadavg, here loadavg = 1*/
5688 loadavg = load_use;
5689 return pid;
5690}
70dcc12e 5691
a83618e2
JS
5692/* Returns 0 on success. */
5693int stop_load_daemon(pthread_t pid)
5694{
5695 int s;
5696
5697 /* Signal the thread to gracefully stop */
5698 loadavg_stop = 1;
5699
5700 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5701 if (s != 0) {
5702 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5703 return -1;
5704 }
5705
5706 load_free();
5707 loadavg_stop = 0;
5708
5709 return 0;
5710}
5711
237e200e
SH
/* Total byte length of @which, summed line by line with getline();
 * returns 0 when the file cannot be opened. */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t len = 0;
	ssize_t n, total = 0;
	FILE *f = fopen(which, "r");

	if (!f)
		return 0;

	while ((n = getline(&line, &len, f)) != -1)
		total += n;

	free(line);
	fclose(f);

	return total;
}
5728
/*
 * FUSE getattr for the emulated /proc subtree.
 * /proc itself is a read-only directory; each emulated file is a
 * read-only regular file with size 0 and all timestamps set to "now".
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *procfiles[] = {
		"/proc/meminfo", "/proc/cpuinfo", "/proc/uptime",
		"/proc/stat", "/proc/diskstats", "/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(procfiles) / sizeof(procfiles[0]); i++) {
		if (strcmp(path, procfiles[i]) == 0) {
			sb->st_size = 0;
			sb->st_mode = S_IFREG | 00444;
			sb->st_nlink = 1;
			return 0;
		}
	}

	return -ENOENT;
}
5758
5759int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5760 struct fuse_file_info *fi)
5761{
d639f863
CB
5762 if (filler(buf, ".", NULL, 0) != 0 ||
5763 filler(buf, "..", NULL, 0) != 0 ||
5764 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5765 filler(buf, "meminfo", NULL, 0) != 0 ||
5766 filler(buf, "stat", NULL, 0) != 0 ||
5767 filler(buf, "uptime", NULL, 0) != 0 ||
5768 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5769 filler(buf, "swaps", NULL, 0) != 0 ||
5770 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5771 return -EINVAL;
5772 return 0;
5773}
5774
5775int proc_open(const char *path, struct fuse_file_info *fi)
5776{
5777 int type = -1;
5778 struct file_info *info;
5779
5780 if (strcmp(path, "/proc/meminfo") == 0)
5781 type = LXC_TYPE_PROC_MEMINFO;
5782 else if (strcmp(path, "/proc/cpuinfo") == 0)
5783 type = LXC_TYPE_PROC_CPUINFO;
5784 else if (strcmp(path, "/proc/uptime") == 0)
5785 type = LXC_TYPE_PROC_UPTIME;
5786 else if (strcmp(path, "/proc/stat") == 0)
5787 type = LXC_TYPE_PROC_STAT;
5788 else if (strcmp(path, "/proc/diskstats") == 0)
5789 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
5790 else if (strcmp(path, "/proc/swaps") == 0)
5791 type = LXC_TYPE_PROC_SWAPS;
46be8eed 5792 else if (strcmp(path, "/proc/loadavg") == 0)
5793 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
5794 if (type == -1)
5795 return -ENOENT;
5796
5797 info = malloc(sizeof(*info));
5798 if (!info)
5799 return -ENOMEM;
5800
5801 memset(info, 0, sizeof(*info));
5802 info->type = type;
5803
5804 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5805 do {
5806 info->buf = malloc(info->buflen);
5807 } while (!info->buf);
5808 memset(info->buf, 0, info->buflen);
5809 /* set actual size to buffer size */
5810 info->size = info->buflen;
5811
5812 fi->fh = (unsigned long)info;
5813 return 0;
5814}
5815
bddbb106
SH
/* FUSE access for our virtual /proc: the directory is allowed whenever the
 * host /proc is readable; every file underneath is strictly read-only. */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
5826
237e200e
SH
/* FUSE release for our virtual /proc files: free the per-open
 * file_info that proc_open() stashed in fi->fh. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5832
5833int proc_read(const char *path, char *buf, size_t size, off_t offset,
5834 struct fuse_file_info *fi)
5835{
5836 struct file_info *f = (struct file_info *) fi->fh;
5837
5838 switch (f->type) {
5839 case LXC_TYPE_PROC_MEMINFO:
5840 return proc_meminfo_read(buf, size, offset, fi);
5841 case LXC_TYPE_PROC_CPUINFO:
5842 return proc_cpuinfo_read(buf, size, offset, fi);
5843 case LXC_TYPE_PROC_UPTIME:
5844 return proc_uptime_read(buf, size, offset, fi);
5845 case LXC_TYPE_PROC_STAT:
5846 return proc_stat_read(buf, size, offset, fi);
5847 case LXC_TYPE_PROC_DISKSTATS:
5848 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
5849 case LXC_TYPE_PROC_SWAPS:
5850 return proc_swaps_read(buf, size, offset, fi);
46be8eed 5851 case LXC_TYPE_PROC_LOADAVG:
5852 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
5853 default:
5854 return -EINVAL;
5855 }
5856}
5857
29a73c2f
CB
5858/*
5859 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
5860 */
5861
/* Create every component of `dir` (like `mkdir -p`), each with `mode`.
 * Existing components are fine; any other mkdir failure aborts.
 * Returns true on success. */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *comp_start = dir;
	const char *comp_end = dir;
	char *prefix;

	do {
		/* Skip the run of '/' separators, then advance past the next
		 * path component. */
		comp_start = comp_end + strspn(comp_end, "/");
		comp_end = comp_start + strcspn(comp_start, "/");

		/* Duplicate the prefix up to the current component and try to
		 * create it. */
		prefix = strndup(dir, comp_start - dir);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) < 0 && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);
	} while (comp_end != comp_start);

	return true;
}
5885
5886static bool umount_if_mounted(void)
5887{
5888 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5889 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5890 return false;
5891 }
5892 return true;
5893}
5894
2283e240
CB
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Compare a statfs result against a filesystem magic constant; the cast
 * papers over f_type's platform-dependent signedness/width. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	if (fs->f_type == (fs_type_magic)magic_val)
		return true;

	return false;
}
5901
0a4dea41
CB
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;
	bool ret = false;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!ret && getline(&line, &len, f) != -1) {
		char *field, *mntpoint_end;
		int i;

		/* Skip ahead to the fifth field: the mount point. */
		for (field = line, i = 0; field && i < 4; i++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		mntpoint_end = strchr(field + 1, ' ');
		if (!mntpoint_end)
			continue;
		*mntpoint_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is its filesystem the ramfs? */
		field = strchr(mntpoint_end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0)
			ret = true;
	}

	free(line);
	fclose(f);
	return ret;
}
5944
cc309f33 5945static int pivot_enter()
0a4dea41 5946{
cc309f33
CB
5947 int ret = -1, oldroot = -1, newroot = -1;
5948
5949 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5950 if (oldroot < 0) {
5951 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5952 return ret;
5953 }
5954
5955 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5956 if (newroot < 0) {
5957 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5958 goto err;
5959 }
5960
5961 /* change into new root fs */
5962 if (fchdir(newroot) < 0) {
5963 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5964 goto err;
5965 }
5966
0a4dea41
CB
5967 /* pivot_root into our new root fs */
5968 if (pivot_root(".", ".") < 0) {
5969 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5970 goto err;
0a4dea41
CB
5971 }
5972
5973 /*
5974 * At this point the old-root is mounted on top of our new-root.
5975 * To unmounted it we must not be chdir'd into it, so escape back
5976 * to the old-root.
5977 */
5978 if (fchdir(oldroot) < 0) {
5979 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5980 goto err;
0a4dea41
CB
5981 }
5982
5983 if (umount2(".", MNT_DETACH) < 0) {
5984 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5985 goto err;
0a4dea41
CB
5986 }
5987
5988 if (fchdir(newroot) < 0) {
5989 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 5990 goto err;
0a4dea41
CB
5991 }
5992
cc309f33
CB
5993 ret = 0;
5994
5995err:
5996 if (oldroot > 0)
5997 close(oldroot);
5998 if (newroot > 0)
5999 close(newroot);
6000
6001 return ret;
0a4dea41
CB
6002}
6003
6004static int chroot_enter()
6005{
6006 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6007 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6008 return -1;
6009 }
6010
6011 if (chroot(".") < 0) {
6012 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6013 return -1;
6014 }
6015
6016 if (chdir("/") < 0) {
6017 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6018 return -1;
6019 }
6020
6021 return 0;
6022}
6023
/* Enter the prepared new root: chroot() when / is a ramfs (pivot_root()
 * does not work there), pivot_root() otherwise.
 * Returns 0 on success, -1 on failure. */
static int permute_and_enter(void)
{
	struct statfs sb;
	int ret;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/1/mountinfo. */
	if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	ret = pivot_enter();
	if (ret < 0)
		lxcfs_error("%s\n", "Could not perform pivot root.");

	return ret;
}
6046
6047/* Prepare our new clean root. */
0232cbac 6048static int permute_prepare(void)
29a73c2f
CB
6049{
6050 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 6051 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
6052 return -1;
6053 }
6054
6055 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 6056 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
6057 return -1;
6058 }
6059
6060 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 6061 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
6062 return -1;
6063 }
6064
6065 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 6066 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
6067 return -1;
6068 }
6069
6070 return 0;
6071}
6072
0232cbac
CB
/* Calls chroot() on ramfs, pivot_root() in all other cases.
 * First stage the new root, then enter it; true only if both succeed. */
static bool permute_root(void)
{
	return permute_prepare() == 0 && permute_and_enter() == 0;
}
6086
a257a8ee
CB
6087static int preserve_mnt_ns(int pid)
6088{
6089 int ret;
6090 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6091 char path[len];
6092
6093 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6094 if (ret < 0 || (size_t)ret >= len)
6095 return -1;
6096
6097 return open(path, O_RDONLY | O_CLOEXEC);
6098}
6099
/* Prepare a private mount namespace with a tmpfs on BASEDIR where
 * cgfs_mount_hierarchies() can mount the cgroup hierarchies.
 *
 * The step order is deliberate and must not change:
 *   1. create BASEDIR and drop any stale mount on it,
 *   2. unshare(CLONE_NEWNS) so the following mounts stay private,
 *   3. pin the new namespace fd (used later to re-enter it),
 *   4. remount / MS_PRIVATE so mounts don't propagate back out,
 *   5. mount a small tmpfs on BASEDIR as the hierarchy anchor.
 * Returns true on success. */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* From here on all mount changes are private to this namespace. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep an fd to the freshly unshared namespace of this process. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Prevent our mounts from leaking into the host namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6135
0a4dea41 6136static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
6137{
6138 char *target;
6139 size_t clen, len;
6140 int i, ret;
6141
6142 for (i = 0; i < num_hierarchies; i++) {
6143 char *controller = hierarchies[i];
51c7ca35 6144
29a73c2f
CB
6145 clen = strlen(controller);
6146 len = strlen(BASEDIR) + clen + 2;
6147 target = malloc(len);
6148 if (!target)
6149 return false;
51c7ca35 6150
29a73c2f
CB
6151 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6152 if (ret < 0 || ret >= len) {
6153 free(target);
6154 return false;
6155 }
6156 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6157 free(target);
6158 return false;
6159 }
51c7ca35
CB
6160 if (!strcmp(controller, "unified"))
6161 ret = mount("none", target, "cgroup2", 0, NULL);
6162 else
6163 ret = mount(controller, target, "cgroup", 0, controller);
6164 if (ret < 0) {
6165 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
6166 free(target);
6167 return false;
6168 }
6169
6170 fd_hierarchies[i] = open(target, O_DIRECTORY);
6171 if (fd_hierarchies[i] < 0) {
6172 free(target);
6173 return false;
6174 }
6175 free(target);
6176 }
6177 return true;
6178}
6179
/* Build lxcfs' private cgroup view: prepare the private namespace and
 * tmpfs, mount every hierarchy, then pivot/chroot into the new root. */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6195
/* Library constructor: discover the cgroup hierarchies this process is in
 * (from /proc/self/cgroup), then set up lxcfs' private cgroup mounts.
 *
 * The sequence is order-critical: the initial mount namespace is pinned
 * first so we can return to it after cgfs_setup_controllers() unshares a
 * private one; the cwd is saved/restored around the pivot_root. On any
 * failure it logs and returns, leaving lxcfs degraded but loaded. */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line looks like "<idx>:<controllers>:<path>"; split out the
	 * middle field and register it as a hierarchy. */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One directory fd slot per discovered hierarchy; -1 marks unused. */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd: the pivot_root below invalidates it. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Switch back to the namespace we pinned before unsharing. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6286
6287static void __attribute__((destructor)) free_subsystems(void)
6288{
6289 int i;
6290
b8defc3d
CB
6291 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6292
29a73c2f 6293 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
6294 if (hierarchies[i])
6295 free(hierarchies[i]);
480262c9 6296 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
6297 close(fd_hierarchies[i]);
6298 }
237e200e 6299 free(hierarchies);
480262c9 6300 free(fd_hierarchies);
056adcef 6301 free_cpuview();
a257a8ee
CB
6302
6303 if (cgroup_mount_ns_fd >= 0)
6304 close(cgroup_mount_ns_fd);
237e200e 6305}