/* lxcfs
 *
 * Copyright © 2014-2016 Canonical, Inc
 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
 *
 * See COPYING file for details.
 */

#define FUSE_USE_VERSION 26

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h" // for VERSION

/* The largest 64-bit integer needs 20 digits plus a terminating NUL, i.e. a
 * string of at most 21 bytes. */
#define LXCFS_NUMSTRLEN64 21

/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif

enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};

struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	// unused as of yet
	int buflen;
	int size;	// actual data size
	int cached;
};

struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online;
};

/* Constants for the hash table that caches per-cgroup load averages. */
#define LOAD_SIZE 100		/* number of hash buckets */
#define FLUSH_TIME 5		/* refresh interval, in seconds */
#define DEPTH_DIR 3		/* maximum cgroup directory depth to recurse into */
/* Constants for calculating the load average (fixed-point, as in the kernel). */
#define FSHIFT 11		/* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT)	/* 1.0 as fixed-point */
#define EXP_1 1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014		/* 1/exp(5sec/5min) */
#define EXP_15 2037		/* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
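
/*
 * LOAD_INT()/LOAD_FRAC() split a fixed-point load value into the integer and
 * two-digit fractional parts that get printed. For example, with FSHIFT = 11
 * an avenrun value of 2560 encodes 2560/2048 = 1.25, so LOAD_INT(2560) is 1
 * and LOAD_FRAC(2560) is 25, displayed as "1.25".
 */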

/*
 * This parameter is used by proc_loadavg_read():
 * 1 means the virtualized loadavg is enabled, 0 means it is disabled.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;

static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int x = 0;
	/* ELFHash algorithm. */
	while (*name) {
		hash = (hash << 4) + *name++;
		x = hash & 0xf0000000;
		if (x != 0)
			hash ^= (x >> 24);
		hash &= ~x;
	}
	return (hash & 0x7fffffff);
}
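
/*
 * Note: calc_hash() only produces the raw, non-negative hash value; callers
 * are expected to reduce it modulo LOAD_SIZE to pick a bucket in the
 * load_hash table declared below.
 */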

struct load_node {
	char *cg;			/* cgroup name */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	struct load_node **pre;		/* pointer to the previous node's next pointer (or to the list head) */
};

struct load_head {
	/*
	 * lock serializes inserting and refreshing load_nodes: for the first
	 * load_node of each hash bucket, insert and refresh are mutually
	 * exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * rdlock guards reading the loadavg against deleting load_nodes: for
	 * each hash bucket, read and delete are mutually exclusive, while
	 * concurrent reads are allowed. This lock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * rilock guards reading the loadavg against inserting load_nodes: for
	 * the first load_node of each hash bucket, read and insert are
	 * mutually exclusive, while concurrent reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initializes the hash table.
 * Returns 0 on success, -1 on failure.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}

static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}

/*
 * locate_node() looks up the node for a specific cgroup. A non-NULL return
 * means success. Note that rdlock is deliberately not released here: the
 * caller is still reading the node, and deletion must not happen before that
 * read has finished. rdlock is released only in proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}

/* Delete the load_node n and return the node that followed it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}

static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head)) {
		lxcfs_error("%s\n", strerror(errno));
		return false;
	}

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		lxcfs_error("%s\n", "Failed to initialize list lock");
		free(*head);
		return false;
	}

	return true;
}

static bool init_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i]) {
			free(proc_stat_history[i]);
			proc_stat_history[i] = NULL;
		}
	}

	return false;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free(head);
}

static void free_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
	}
}

/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;			// inode number for /proc/$pid/ns/pid
	pid_t initpid;			// the pid of init in that ns
	long int ctime;			// the time at which /proc/initpid was created
	struct pidns_init_store *next;
	long int lastcheck;
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;

static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_lock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;

static void unlock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_unlock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}

/* Must be called under store_lock */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		    initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}

/* Must be called under store_lock */
static void remove_initpid(struct pidns_init_store *e)
{
	struct pidns_init_store *tmp;
	int h;

	lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);

	h = HASH(e->ino);
	if (pidns_hash_table[h] == e) {
		pidns_hash_table[h] = e->next;
		free(e);
		return;
	}

	tmp = pidns_hash_table[h];
	while (tmp) {
		if (tmp->next == e) {
			tmp->next = e->next;
			free(e);
			return;
		}
		tmp = tmp->next;
	}
}

#define PURGE_SECS 5
/* Must be called under store_lock */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}

/* Must be called under store_lock */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}

/*
 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 * entry for the inode number and creation time. Verify that the init pid
 * is still valid. If not, remove it. Return the entry if valid, NULL
 * otherwise.
 * Must be called under store_lock
 */
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
	int h = HASH(sb->st_ino);
	struct pidns_init_store *e = pidns_hash_table[h];

	while (e) {
		if (e->ino == sb->st_ino) {
			if (initpid_still_valid(e, sb)) {
				e->lastcheck = time(NULL);
				return e;
			}
			remove_initpid(e);
			return NULL;
		}
		e = e->next;
	}

	return NULL;
}

static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}

static char *must_copy_string(const char *str)
{
	char *dup = NULL;
	if (!str)
		return NULL;
	do {
		dup = strdup(str);
	} while (!dup);

	return dup;
}

static inline void drop_trailing_newlines(char *s)
{
	int l;

	for (l = strlen(s); l > 0 && s[l-1] == '\n'; l--)
		s[l-1] = '\0';
}

#define BATCH_SIZE 50
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;
		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}

static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = newlen;
}

static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}

static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f)
		return false;

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}

struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

#define ALLOC_NUM 20
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}

static void print_subsystems(void)
{
	int i;

	fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i])
			fprintf(stderr, " %2d: fd: %3d: %s\n", i,
				fd_hierarchies[i], hierarchies[i]);
	}
}

static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *s = haystack, *e;
	size_t nlen = strlen(needle);

	while (*s && (e = strchr(s, ','))) {
		if (nlen != e - s) {
			s = e + 1;
			continue;
		}
		if (strncmp(needle, s, nlen) == 0)
			return true;
		s = e + 1;
	}
	if (strcmp(needle, s) == 0)
		return true;
	return false;
}
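
/*
 * Example: in_comma_list("cpu", "cpu,cpuacct") is true, while
 * in_comma_list("cpu", "cpuset,cpuacct") is false, since only whole
 * comma-separated entries are matched.
 */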

/* do we need to do any massaging here? I'm not sure... */
/* Return the mounted controller and store the corresponding open file descriptor
 * referring to the controller mountpoint in the private lxcfs namespace in
 * @cfd.
 */
static char *find_mounted_controller(const char *controller, int *cfd)
{
	int i;

	for (i = 0; i < num_hierarchies; i++) {
		if (!hierarchies[i])
			continue;
		if (strcmp(hierarchies[i], controller) == 0) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
		if (in_comma_list(controller, hierarchies[i])) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
	}

	return NULL;
}

bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
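
/*
 * Example of the path handling above: for cgroup "/lxc/c1" and file
 * "memory.limit_in_bytes", fnam becomes "./lxc/c1/memory.limit_in_bytes",
 * which is then opened relative to cfd, the fd of the controller's
 * mountpoint in the private mount namespace.
 */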

// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}

int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}

static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() takes ownership of the fd and closedir() closes it, so work on a duplicate.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}

bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam, *tmpc;
	bool bret;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}

bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchmodat(cfd, pathname, mode, 0) < 0)
		return false;
	return true;
}

static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *fname;

	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);
	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	return 0;
}

int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}

FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}

static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
				void ***list, size_t typesize,
				void* (*iterator)(const char*, const char*, const char*))
{
	int cfd, fd, ret;
	size_t len;
	char *cg, *tmpc;
	char pathname[MAXPATHLEN];
	size_t sz = 0, asz = 0;
	struct dirent *dirent;
	DIR *dir;

	tmpc = find_mounted_controller(controller, &cfd);
	*list = NULL;
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions. */
	len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
	cg = alloca(len);
	ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
	if (ret < 0 || (size_t)ret >= len) {
		lxcfs_error("Pathname too long under %s\n", cgroup);
		return false;
	}

	fd = openat(cfd, cg, O_DIRECTORY);
	if (fd < 0)
		return false;

	dir = fdopendir(fd);
	if (!dir)
		return false;

	while ((dirent = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(dirent->d_name, ".") ||
		    !strcmp(dirent->d_name, ".."))
			continue;

		ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", cg);
			continue;
		}

		ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (ret) {
			lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
			continue;
		}
		if ((!directories && !S_ISREG(mystat.st_mode)) ||
		    (directories && !S_ISDIR(mystat.st_mode)))
			continue;

		if (sz+2 >= asz) {
			void **tmp;
			asz += BATCH_SIZE;
			do {
				tmp = realloc(*list, asz * typesize);
			} while (!tmp);
			*list = tmp;
		}
		(*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
		(*list)[sz+1] = NULL;
		sz++;
	}
	if (closedir(dir) < 0) {
		lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
		return false;
	}
	return true;
}
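
/*
 * The list built above is NULL-terminated and grown in BATCH_SIZE steps;
 * each entry is produced by the iterator callback, e.g.
 * make_children_list_entry() or make_key_list_entry() below.
 */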

static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *dup;
	do {
		dup = strdup(dir_entry);
	} while (!dup);
	return dup;
}

bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}

void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free(k->name);
	free(k);
}

void free_keys(struct cgfs_files **keys)
{
	int i;

	if (!keys)
		return;
	for (i = 0; keys[i]; i++) {
		free_key(keys[i]);
	}
	free(keys);
}

bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}

bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return (faccessat(cfd, fnam, F_OK, 0) == 0);
}

struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;
	struct stat sb;
	struct cgfs_files *newkey;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	if (file && *file == '/')
		file++;

	if (file && strchr(file, '/'))
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + 3;
	if (file)
		len += strlen(file) + 1;
	fnam = alloca(len);
	snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
		 file ? "/" : "", file ? file : "");

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0)
		return NULL;

	do {
		newkey = malloc(sizeof(struct cgfs_files));
	} while (!newkey);
	if (file)
		newkey->name = must_copy_string(file);
	else if (strrchr(cgroup, '/'))
		newkey->name = must_copy_string(strrchr(cgroup, '/'));
	else
		newkey->name = must_copy_string(cgroup);
	newkey->uid = sb.st_uid;
	newkey->gid = sb.st_gid;
	newkey->mode = sb.st_mode;

	return newkey;
}

static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry) {
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);
	}
	return entry;
}

bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}

bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd;
	size_t len;
	char *fnam, *tmpc;
	int ret;
	struct stat sb;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	len = strlen(cgroup) + strlen(f) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0 || !S_ISDIR(sb.st_mode))
		return false;

	return true;
}

#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);

/*
 * Clone a task which switches to @task's namespace and writes '1' over a
 * unix socket so we can read the task's reaper's pid in our namespace.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}

static int send_creds_clone_wrapper(void *arg)
{
	struct ucred cred;
	char v;
	int sock = *(int *)arg;

	/* we are the child */
	cred.uid = 0;
	cred.gid = 0;
	cred.pid = 1;
	v = '1';
	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
		return 1;
	return 0;
}

static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}

static pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* Prune at the end so we do not prune the entry whose value we are
	 * about to return. */
	prune_initpid_store();
	store_unlock();
	return answer;
}

static int wait_for_pid(pid_t pid)
{
	int status, ret;

	if (pid <= 0)
		return -1;

again:
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		if (errno == EINTR)
			goto again;
		return -1;
	}
	if (ret != pid)
		goto again;
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;
	return 0;
}

/*
 * append pid to *src.
 * src: a pointer to a char* in which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];

	int tmplen = sprintf(tmp, "%d\n", (int)pid);

	if (!*src || tmplen + *sz + 1 >= *asz) {
		char *tmp;
		do {
			tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!tmp);
		*src = tmp;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy((*src) + *sz, tmp, tmplen + 1); /* include the \0 */
	*sz += tmplen;
}

/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,	// base id for a range in the idfile's namespace
		hostuid,	// base id for a range in the caller's namespace
		count;		// number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid+count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid), which must be
			 * less than nsuid+count, does not wrap around either
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
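
/*
 * Example: with a uid_map line "0 100000 65536", a caller-namespace id of
 * 100005 falls in the range starting at 100000 and maps to
 * (100005 - 100000) + 0 = 5 in the target namespace.
 */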

/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000)
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}

static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t r;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		r = S_IROTH;
		break;
	case O_WRONLY:
		r = S_IWOTH;
		break;
	case O_RDWR:
		r = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}
	return ((fmode & r) == r);
}
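
/*
 * Example: perms_include(k->mode >> 6, O_RDONLY) asks whether the owner
 * permission bits include read, while perms_include(k->mode, O_RDWR) checks
 * the "other" bits for both read and write.
 */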

/*
 * querycg is /a/b/c
 * taskcg is /a/b/c/d/e
 * we return 'd'
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *start, *end;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		start = strdup(taskcg + 1);
	else
		start = strdup(taskcg + strlen(querycg) + 1);
	if (!start)
		return NULL;
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}

static void stripnewline(char *x)
{
	size_t l = strlen(x);
	if (l && x[l-1] == '\n')
		x[l-1] = '\0';
}

static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}
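
/*
 * Example /proc/$pid/cgroup (v1) line: "4:memory:/lxc/c1". For
 * contrl == "memory" this function returns a copy of "/lxc/c1".
 */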

/*
 * check whether a fuse context may access a cgroup dir or file
 *
 * If file is not null, it is a cgroup file to check under cg.
 * If file is null, then we are checking perms on cg itself.
 *
 * For files we can check the mode of the list_keys result.
 * For cgroups, we must make assumptions based on the files under the
 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
 * yet.
 */
static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
{
	struct cgfs_files *k = NULL;
	bool ret = false;

	k = cgfs_get_key(contrl, cg, file);
	if (!k)
		return false;

	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		if (perms_include(k->mode >> 6, mode)) {
			ret = true;
			goto out;
		}
	}
	if (fc->gid == k->gid) {
		if (perms_include(k->mode >> 3, mode)) {
			ret = true;
			goto out;
		}
	}
	ret = perms_include(k->mode, mode);

out:
	free_key(k);
	return ret;
}

#define INITSCOPE "/init.scope"
static void prune_init_slice(char *cg)
{
	char *point;
	size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);

	if (cg_len < initscope_len)
		return;

	point = cg + cg_len - initscope_len;
	if (strcmp(point, INITSCOPE) == 0) {
		if (point == cg)
			*(point+1) = '\0';
		else
			*point = '\0';
	}
}
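
/*
 * Example: "/user.slice/init.scope" is pruned to "/user.slice", and
 * "/init.scope" is pruned to "/".
 */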

/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}

/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
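
/*
 * Example: a task in /a/b/c may see "a", "a/b" and "a/b/c/d", but neither
 * "b" nor "a/x".
 */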

/*
 * given /cgroup/freezer/a/b, return "freezer".
 * the returned char* should NOT be freed.
 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8;
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}

/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *p1;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	p1 = strstr(path + 8, "/");
	if (!p1) {
		errno = EINVAL;
		return NULL;
	}
	errno = 0;
	return p1 + 1;
}

/*
 * split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last not
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *p;

	do {
		*dir = strdup(cg);
	} while (!*dir);
	*last = strrchr(cg, '/');
	if (!*last) {
		*last = NULL;
		return;
	}
	p = strrchr(*dir, '/');
	*p = '\0';
}

/*
 * FUSE ops for /cgroup
 */

int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;

	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}

int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}

int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}

static void do_release_file_info(struct fuse_file_info *fi)
{
	struct file_info *f = (struct file_info *)fi->fh;

	if (!f)
		return;

	fi->fh = 0;
237e200e 2181 free(f->controller);
43215927 2182 f->controller = NULL;
237e200e 2183 free(f->cgroup);
43215927 2184 f->cgroup = NULL;
237e200e 2185 free(f->file);
43215927 2186 f->file = NULL;
237e200e 2187 free(f->buf);
43215927 2188 f->buf = NULL;
237e200e 2189 free(f);
bbb508dd 2190 f = NULL;
237e200e
SH
2191}
2192
2193int cg_releasedir(const char *path, struct fuse_file_info *fi)
2194{
43215927 2195 do_release_file_info(fi);
237e200e
SH
2196 return 0;
2197}
2198
2199int cg_open(const char *path, struct fuse_file_info *fi)
2200{
2201 const char *cgroup;
2202 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2203 struct cgfs_files *k = NULL;
2204 struct file_info *file_info;
2205 struct fuse_context *fc = fuse_get_context();
2206 int ret;
2207
2208 if (!fc)
2209 return -EIO;
2210
2211 controller = pick_controller_from_path(fc, path);
2212 if (!controller)
2f7036d0 2213 return -errno;
237e200e
SH
2214 cgroup = find_cgroup_in_path(path);
2215 if (!cgroup)
bc70ba9b 2216 return -errno;
237e200e
SH
2217
2218 get_cgdir_and_path(cgroup, &cgdir, &last);
2219 if (!last) {
2220 path1 = "/";
2221 path2 = cgdir;
2222 } else {
2223 path1 = cgdir;
2224 path2 = last;
2225 }
2226
2227 k = cgfs_get_key(controller, path1, path2);
2228 if (!k) {
2229 ret = -EINVAL;
2230 goto out;
2231 }
2232 free_key(k);
2233
2234 pid_t initpid = lookup_initpid_in_store(fc->pid);
2235 if (initpid <= 0)
2236 initpid = fc->pid;
2237 if (!caller_may_see_dir(initpid, controller, path1)) {
2238 ret = -ENOENT;
2239 goto out;
2240 }
2241 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2242 ret = -EACCES;
2243 goto out;
2244 }
2245
2246 /* we'll free this at cg_release */
2247 file_info = malloc(sizeof(*file_info));
2248 if (!file_info) {
2249 ret = -ENOMEM;
2250 goto out;
2251 }
2252 file_info->controller = must_copy_string(controller);
2253 file_info->cgroup = must_copy_string(path1);
2254 file_info->file = must_copy_string(path2);
2255 file_info->type = LXC_TYPE_CGFILE;
2256 file_info->buf = NULL;
2257 file_info->buflen = 0;
2258
2259 fi->fh = (unsigned long)file_info;
2260 ret = 0;
2261
2262out:
2263 free(cgdir);
2264 return ret;
2265}
2266
bddbb106
SH
2267int cg_access(const char *path, int mode)
2268{
6f0f6b83 2269 int ret;
bddbb106 2270 const char *cgroup;
6f0f6b83
CB
2271 char *path1, *path2, *controller;
2272 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2273 struct cgfs_files *k = NULL;
2274 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2275
9873c5e8 2276 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2277 return 0;
bddbb106
SH
2278
2279 if (!fc)
2280 return -EIO;
2281
2282 controller = pick_controller_from_path(fc, path);
2283 if (!controller)
2f7036d0 2284 return -errno;
bddbb106 2285 cgroup = find_cgroup_in_path(path);
575316c4
SH
2286 if (!cgroup) {
2287 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2288 if ((mode & W_OK) == 0)
2289 return 0;
2290 return -EACCES;
575316c4 2291 }
bddbb106
SH
2292
2293 get_cgdir_and_path(cgroup, &cgdir, &last);
2294 if (!last) {
2295 path1 = "/";
2296 path2 = cgdir;
2297 } else {
2298 path1 = cgdir;
2299 path2 = last;
2300 }
2301
2302 k = cgfs_get_key(controller, path1, path2);
2303 if (!k) {
3f441bc7
SH
2304 if ((mode & W_OK) == 0)
2305 ret = 0;
2306 else
2307 ret = -EACCES;
bddbb106
SH
2308 goto out;
2309 }
2310 free_key(k);
2311
2312 pid_t initpid = lookup_initpid_in_store(fc->pid);
2313 if (initpid <= 0)
2314 initpid = fc->pid;
2315 if (!caller_may_see_dir(initpid, controller, path1)) {
2316 ret = -ENOENT;
2317 goto out;
2318 }
2319 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2320 ret = -EACCES;
2321 goto out;
2322 }
2323
2324 ret = 0;
2325
2326out:
2327 free(cgdir);
2328 return ret;
2329}
2330
237e200e
SH
2331int cg_release(const char *path, struct fuse_file_info *fi)
2332{
43215927 2333 do_release_file_info(fi);
237e200e
SH
2334 return 0;
2335}
2336
2337#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2338
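/* Wait up to @timeout seconds for @sock to become readable (or to hang up),
 * retrying interrupted waits until the deadline passes. Returns true when the
 * socket is ready, false on timeout or error. */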
2339static bool wait_for_sock(int sock, int timeout)
2340{
2341 struct epoll_event ev;
2342 int epfd, ret, now, starttime, deltatime, saved_errno;
2343
2344 if ((starttime = time(NULL)) < 0)
2345 return false;
2346
2347 if ((epfd = epoll_create(1)) < 0) {
b8defc3d 2348 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
237e200e
SH
2349 return false;
2350 }
2351
2352 ev.events = POLLIN_SET;
2353 ev.data.fd = sock;
2354 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
b8defc3d 2355 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
237e200e
SH
2356 close(epfd);
2357 return false;
2358 }
2359
2360again:
2361 if ((now = time(NULL)) < 0) {
2362 close(epfd);
2363 return false;
2364 }
2365
2366 deltatime = (starttime + timeout) - now;
2367 if (deltatime < 0) { // timeout
2368 errno = 0;
2369 close(epfd);
2370 return false;
2371 }
2372 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2373 if (ret < 0 && errno == EINTR)
2374 goto again;
2375 saved_errno = errno;
2376 close(epfd);
2377
2378 if (ret <= 0) {
2379 errno = saved_errno;
2380 return false;
2381 }
2382 return true;
2383}
2384
2385static int msgrecv(int sockfd, void *buf, size_t len)
2386{
2387 if (!wait_for_sock(sockfd, 2))
2388 return -1;
2389 return recv(sockfd, buf, len, MSG_DONTWAIT);
2390}
2391
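/* send_creds()/recv_creds() implement a small handshake over a SOCK_DGRAM
 * socketpair: the receiver enables SO_PASSCRED and writes a single ping byte,
 * the sender (with pingfirst set) waits for that ping and then sends one
 * payload byte carrying its struct ucred as SCM_CREDENTIALS ancillary data,
 * which the kernel translates into the receiver's pid namespace. */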
2392static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2393{
2394 struct msghdr msg = { 0 };
2395 struct iovec iov;
2396 struct cmsghdr *cmsg;
2397 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2398 char buf[1];
2399 buf[0] = 'p';
2400
2401 if (pingfirst) {
2402 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2403 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2404 return SEND_CREDS_FAIL;
2405 }
2406 }
2407
2408 msg.msg_control = cmsgbuf;
2409 msg.msg_controllen = sizeof(cmsgbuf);
2410
2411 cmsg = CMSG_FIRSTHDR(&msg);
2412 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2413 cmsg->cmsg_level = SOL_SOCKET;
2414 cmsg->cmsg_type = SCM_CREDENTIALS;
2415 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2416
2417 msg.msg_name = NULL;
2418 msg.msg_namelen = 0;
2419
2420 buf[0] = v;
2421 iov.iov_base = buf;
2422 iov.iov_len = sizeof(buf);
2423 msg.msg_iov = &iov;
2424 msg.msg_iovlen = 1;
2425
2426 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2427 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
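		/* errno 3 is ESRCH: the task we are sending to is gone. */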
2428 if (errno == 3)
2429 return SEND_CREDS_NOTSK;
2430 return SEND_CREDS_FAIL;
2431 }
2432
2433 return SEND_CREDS_OK;
2434}
2435
2436static bool recv_creds(int sock, struct ucred *cred, char *v)
2437{
2438 struct msghdr msg = { 0 };
2439 struct iovec iov;
2440 struct cmsghdr *cmsg;
2441 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2442 char buf[1];
2443 int ret;
2444 int optval = 1;
2445
2446 *v = '1';
2447
2448 cred->pid = -1;
2449 cred->uid = -1;
2450 cred->gid = -1;
2451
2452 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
b8defc3d 2453 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
237e200e
SH
2454 return false;
2455 }
2456 buf[0] = '1';
2457 if (write(sock, buf, 1) != 1) {
b8defc3d 2458 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
237e200e
SH
2459 return false;
2460 }
2461
2462 msg.msg_name = NULL;
2463 msg.msg_namelen = 0;
2464 msg.msg_control = cmsgbuf;
2465 msg.msg_controllen = sizeof(cmsgbuf);
2466
2467 iov.iov_base = buf;
2468 iov.iov_len = sizeof(buf);
2469 msg.msg_iov = &iov;
2470 msg.msg_iovlen = 1;
2471
2472 if (!wait_for_sock(sock, 2)) {
b8defc3d 2473 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
237e200e
SH
2474 return false;
2475 }
2476 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2477 if (ret < 0) {
b8defc3d 2478 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
237e200e
SH
2479 return false;
2480 }
2481
2482 cmsg = CMSG_FIRSTHDR(&msg);
2483
2484 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2485 cmsg->cmsg_level == SOL_SOCKET &&
2486 cmsg->cmsg_type == SCM_CREDENTIALS) {
2487 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2488 }
2489 *v = buf[0];
2490
2491 return true;
2492}
2493
35174b0f
FG
2494struct pid_ns_clone_args {
2495 int *cpipe;
2496 int sock;
2497 pid_t tpid;
2498 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2499};
2500
2501/*
2502 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2503 * with clone(). This simply writes '1' as ACK back to the parent
2504 * before calling the actual wrapped function.
2505 */
2506static int pid_ns_clone_wrapper(void *arg) {
2507 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2508 char b = '1';
2509
2510 close(args->cpipe[0]);
b8defc3d
CB
2511 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2512 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2513 close(args->cpipe[1]);
2514 return args->wrapped(args->sock, args->tpid);
2515}
237e200e
SH
2516
2517/*
2518 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2519 * int value back over the socket. This shifts the pid from the
2520 * sender's pidns into tpid's pidns.
2521 */
35174b0f 2522static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2523{
2524 char v = '0';
2525 struct ucred cred;
2526
2527 while (recv_creds(sock, &cred, &v)) {
2528 if (v == '1')
35174b0f 2529 return 0;
237e200e 2530 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2531 return 1;
237e200e 2532 }
35174b0f 2533 return 0;
237e200e
SH
2534}
2535
35174b0f 2536
237e200e
SH
2537/*
2538 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2539 * in your old pidns. Only children which you clone will be in the target
2540 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2541 * actually convert pids.
2542 *
2543 * Note: glibc's fork() does not respect pidns, which can lead to failed
2544 * assertions inside glibc (and thus failed forks) if the child's pid in
2545 * the pidns and the parent pid outside are identical. Using clone prevents
2546 * this issue.
237e200e
SH
2547 */
2548static void pid_to_ns_wrapper(int sock, pid_t tpid)
2549{
2550 int newnsfd = -1, ret, cpipe[2];
2551 char fnam[100];
2552 pid_t cpid;
2553 char v;
2554
2555 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2556 if (ret < 0 || ret >= sizeof(fnam))
2557 _exit(1);
2558 newnsfd = open(fnam, O_RDONLY);
2559 if (newnsfd < 0)
2560 _exit(1);
2561 if (setns(newnsfd, 0) < 0)
2562 _exit(1);
2563 close(newnsfd);
2564
2565 if (pipe(cpipe) < 0)
2566 _exit(1);
2567
35174b0f
FG
2568 struct pid_ns_clone_args args = {
2569 .cpipe = cpipe,
2570 .sock = sock,
2571 .tpid = tpid,
2572 .wrapped = &pid_to_ns
2573 };
2574 size_t stack_size = sysconf(_SC_PAGESIZE);
2575 void *stack = alloca(stack_size);
2576
2577 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2578 if (cpid < 0)
2579 _exit(1);
2580
237e200e
SH
2581 // give the child 1 second to be done forking and
2582 // write its ack
2583 if (!wait_for_sock(cpipe[0], 1))
2584 _exit(1);
2585 ret = read(cpipe[0], &v, 1);
2586 if (ret != sizeof(char) || v != '1')
2587 _exit(1);
2588
2589 if (!wait_for_pid(cpid))
2590 _exit(1);
2591 _exit(0);
2592}
2593
2594/*
2595 * To read a pid-list cgroup file on behalf of a task in another pidns:
2596 * read the raw pids here, then fork a child which setns()s into the target
2597 * pidns and translates each pid back to us over a socketpair (SCM_CREDENTIALS).
2598 */
2599bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2600{
2601 int sock[2] = {-1, -1};
2602 char *tmpdata = NULL;
2603 int ret;
2604 pid_t qpid, cpid = -1;
2605 bool answer = false;
2606 char v = '0';
2607 struct ucred cred;
2608 size_t sz = 0, asz = 0;
2609
2610 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2611 return false;
2612
2613 /*
2614 * Now we read the pids from returned data one by one, pass
2615 * them into a child in the target namespace, read back the
2616 * translated pids, and put them into our to-return data
2617 */
2618
2619 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2620 perror("socketpair");
2621 free(tmpdata);
2622 return false;
2623 }
2624
2625 cpid = fork();
2626 if (cpid == -1)
2627 goto out;
2628
2629 if (!cpid) // child - exits when done
2630 pid_to_ns_wrapper(sock[1], tpid);
2631
2632 char *ptr = tmpdata;
2633 cred.uid = 0;
2634 cred.gid = 0;
2635 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2636 cred.pid = qpid;
2637 ret = send_creds(sock[0], &cred, v, true);
2638
2639 if (ret == SEND_CREDS_NOTSK)
2640 goto next;
2641 if (ret == SEND_CREDS_FAIL)
2642 goto out;
2643
2644 // read converted results
2645 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2646 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2647 goto out;
2648 }
2649 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2650 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2651 goto out;
2652 }
2653 must_strcat_pid(d, &sz, &asz, qpid);
2654next:
2655 ptr = strchr(ptr, '\n');
2656 if (!ptr)
2657 break;
2658 ptr++;
2659 }
2660
2661 cred.pid = getpid();
2662 v = '1';
2663 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2664 // failed to ask child to exit
b8defc3d 2665 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2666 goto out;
2667 }
2668
2669 answer = true;
2670
2671out:
2672 free(tmpdata);
2673 if (cpid != -1)
2674 wait_for_pid(cpid);
2675 if (sock[0] != -1) {
2676 close(sock[0]);
2677 close(sock[1]);
2678 }
2679 return answer;
2680}
2681
2682int cg_read(const char *path, char *buf, size_t size, off_t offset,
2683 struct fuse_file_info *fi)
2684{
2685 struct fuse_context *fc = fuse_get_context();
2686 struct file_info *f = (struct file_info *)fi->fh;
2687 struct cgfs_files *k = NULL;
2688 char *data = NULL;
2689 int ret, s;
2690 bool r;
2691
2692 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2693 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2694 return -EIO;
2695 }
2696
2697 if (offset)
2698 return 0;
2699
2700 if (!fc)
2701 return -EIO;
2702
2703 if (!f->controller)
2704 return -EINVAL;
2705
2706 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2707 return -EINVAL;
2708 }
2709 free_key(k);
2710
2711
888f8f3c 2712 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2713 ret = -EACCES;
2714 goto out;
2715 }
2716
2717 if (strcmp(f->file, "tasks") == 0 ||
2718 strcmp(f->file, "/tasks") == 0 ||
2719 strcmp(f->file, "/cgroup.procs") == 0 ||
2720 strcmp(f->file, "cgroup.procs") == 0)
2721 // special case - we have to translate the pids
2722 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2723 else
2724 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2725
2726 if (!r) {
2727 ret = -EINVAL;
2728 goto out;
2729 }
2730
2731 if (!data) {
2732 ret = 0;
2733 goto out;
2734 }
2735 s = strlen(data);
2736 if (s > size)
2737 s = size;
2738 memcpy(buf, data, s);
2739 if (s > 0 && s < size && data[s-1] != '\n')
2740 buf[s++] = '\n';
2741
2742 ret = s;
2743
2744out:
2745 free(data);
2746 return ret;
2747}
2748
35174b0f 2749static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2750{
2751 pid_t vpid;
2752 struct ucred cred;
2753 char v;
2754 int ret;
2755
2756 cred.uid = 0;
2757 cred.gid = 0;
2758 while (1) {
2759 if (!wait_for_sock(sock, 2)) {
b8defc3d 2760 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2761 return 1;
237e200e
SH
2762 }
2763 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2764 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2765 return 1;
237e200e
SH
2766 }
2767 if (vpid == -1) // done
2768 break;
2769 v = '0';
2770 cred.pid = vpid;
2771 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2772 v = '1';
2773 cred.pid = getpid();
2774 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2775 return 1;
237e200e
SH
2776 }
2777 }
35174b0f 2778 return 0;
237e200e
SH
2779}
2780
2781static void pid_from_ns_wrapper(int sock, pid_t tpid)
2782{
2783 int newnsfd = -1, ret, cpipe[2];
2784 char fnam[100];
2785 pid_t cpid;
2786 char v;
2787
2788 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2789 if (ret < 0 || ret >= sizeof(fnam))
2790 _exit(1);
2791 newnsfd = open(fnam, O_RDONLY);
2792 if (newnsfd < 0)
2793 _exit(1);
2794 if (setns(newnsfd, 0) < 0)
2795 _exit(1);
2796 close(newnsfd);
2797
2798 if (pipe(cpipe) < 0)
2799 _exit(1);
2800
35174b0f
FG
2801 struct pid_ns_clone_args args = {
2802 .cpipe = cpipe,
2803 .sock = sock,
2804 .tpid = tpid,
2805 .wrapped = &pid_from_ns
2806 };
f0f8b851
SH
2807 size_t stack_size = sysconf(_SC_PAGESIZE);
2808 void *stack = alloca(stack_size);
35174b0f
FG
2809
2810 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2811 if (cpid < 0)
2812 _exit(1);
2813
237e200e
SH
2814 // give the child 1 second to be done forking and
2815 // write its ack
2816 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2817 _exit(1);
237e200e 2818 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2819 if (ret != sizeof(char) || v != '1')
2820 _exit(1);
237e200e
SH
2821
2822 if (!wait_for_pid(cpid))
2823 _exit(1);
2824 _exit(0);
237e200e
SH
2825}
2826
2827/*
2828 * Given host @uid, return the uid to which it maps in
2829 * @pid's user namespace, or -1 if none.
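 * (Lines in /proc/<pid>/uid_map have the form "<ns-start> <host-start> <range>",
 * e.g. "0 100000 65536".)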
2830 */
2831bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2832{
2833 FILE *f;
2834 char line[400];
2835
2836 sprintf(line, "/proc/%d/uid_map", pid);
2837 if ((f = fopen(line, "r")) == NULL) {
2838 return false;
2839 }
2840
2841 *answer = convert_id_to_ns(f, uid);
2842 fclose(f);
2843
2844 if (*answer == -1)
2845 return false;
2846 return true;
2847}
2848
2849/*
2850 * get_pid_creds: get the real uid and gid of @pid from
2851 * /proc/$$/status
2852 * (XXX should we use euid here?)
2853 */
2854void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2855{
2856 char line[400];
2857 uid_t u;
2858 gid_t g;
2859 FILE *f;
2860
2861 *uid = -1;
2862 *gid = -1;
2863 sprintf(line, "/proc/%d/status", pid);
2864 if ((f = fopen(line, "r")) == NULL) {
b8defc3d 2865 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
237e200e
SH
2866 return;
2867 }
2868 while (fgets(line, 400, f)) {
2869 if (strncmp(line, "Uid:", 4) == 0) {
2870 if (sscanf(line+4, "%u", &u) != 1) {
b8defc3d 2871 lxcfs_error("bad uid line for pid %u\n", pid);
237e200e
SH
2872 fclose(f);
2873 return;
2874 }
2875 *uid = u;
2876 } else if (strncmp(line, "Gid:", 4) == 0) {
2877 if (sscanf(line+4, "%u", &g) != 1) {
b8defc3d 2878 lxcfs_error("bad gid line for pid %u\n", pid);
237e200e
SH
2879 fclose(f);
2880 return;
2881 }
2882 *gid = g;
2883 }
2884 }
2885 fclose(f);
2886}
2887
2888/*
2889 * May the requestor @r move victim @v to a new cgroup?
2890 * This is allowed if
2891 * . they are the same task
2892 * . they are owned by the same uid
2893 * . @r is root on the host, or
2894 * . @v's uid is mapped into @r's where @r is root.
2895 */
2896bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2897{
2898 uid_t v_uid, tmpuid;
2899 gid_t v_gid;
2900
2901 if (r == v)
2902 return true;
2903 if (r_uid == 0)
2904 return true;
2905 get_pid_creds(v, &v_uid, &v_gid);
2906 if (r_uid == v_uid)
2907 return true;
2908 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2909 && hostuid_to_ns(v_uid, r, &tmpuid))
2910 return true;
2911 return false;
2912}
2913
2914static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2915 const char *file, const char *buf)
2916{
2917 int sock[2] = {-1, -1};
2918 pid_t qpid, cpid = -1;
2919 FILE *pids_file = NULL;
2920 bool answer = false, fail = false;
2921
2922 pids_file = open_pids_file(contrl, cg);
2923 if (!pids_file)
2924 return false;
2925
2926 /*
2927 * write the pids to a socket, have helper in writer's pidns
2928 * call movepid for us
2929 */
2930 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2931 perror("socketpair");
2932 goto out;
2933 }
2934
2935 cpid = fork();
2936 if (cpid == -1)
2937 goto out;
2938
2939 if (!cpid) { // child
2940 fclose(pids_file);
2941 pid_from_ns_wrapper(sock[1], tpid);
2942 }
2943
2944 const char *ptr = buf;
2945 while (sscanf(ptr, "%d", &qpid) == 1) {
2946 struct ucred cred;
2947 char v;
2948
2949 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2950 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
237e200e
SH
2951 goto out;
2952 }
2953
2954 if (recv_creds(sock[0], &cred, &v)) {
2955 if (v == '0') {
2956 if (!may_move_pid(tpid, tuid, cred.pid)) {
2957 fail = true;
2958 break;
2959 }
2960 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2961 fail = true;
2962 }
2963 }
2964
2965 ptr = strchr(ptr, '\n');
2966 if (!ptr)
2967 break;
2968 ptr++;
2969 }
2970
2971 /* All good, write the value */
2972 qpid = -1;
2973 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
b8defc3d 2974 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
237e200e
SH
2975
2976 if (!fail)
2977 answer = true;
2978
2979out:
2980 if (cpid != -1)
2981 wait_for_pid(cpid);
2982 if (sock[0] != -1) {
2983 close(sock[0]);
2984 close(sock[1]);
2985 }
2986 if (pids_file) {
2987 if (fclose(pids_file) != 0)
2988 answer = false;
2989 }
2990 return answer;
2991}
2992
2993int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2994 struct fuse_file_info *fi)
2995{
2996 struct fuse_context *fc = fuse_get_context();
2997 char *localbuf = NULL;
2998 struct cgfs_files *k = NULL;
2999 struct file_info *f = (struct file_info *)fi->fh;
3000 bool r;
3001
3002 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 3003 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
3004 return -EIO;
3005 }
3006
3007 if (offset)
3008 return 0;
3009
3010 if (!fc)
3011 return -EIO;
3012
3013 localbuf = alloca(size+1);
3014 localbuf[size] = '\0';
3015 memcpy(localbuf, buf, size);
3016
3017 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3018 size = -EINVAL;
3019 goto out;
3020 }
3021
3022 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3023 size = -EACCES;
3024 goto out;
3025 }
3026
3027 if (strcmp(f->file, "tasks") == 0 ||
3028 strcmp(f->file, "/tasks") == 0 ||
3029 strcmp(f->file, "/cgroup.procs") == 0 ||
3030 strcmp(f->file, "cgroup.procs") == 0)
3031 // special case - we have to translate the pids
3032 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3033 else
3034 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3035
3036 if (!r)
3037 size = -EINVAL;
3038
3039out:
3040 free_key(k);
3041 return size;
3042}
3043
3044int cg_chown(const char *path, uid_t uid, gid_t gid)
3045{
3046 struct fuse_context *fc = fuse_get_context();
3047 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3048 struct cgfs_files *k = NULL;
3049 const char *cgroup;
3050 int ret;
3051
3052 if (!fc)
3053 return -EIO;
3054
3055 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 3056 return -EPERM;
237e200e
SH
3057
3058 controller = pick_controller_from_path(fc, path);
3059 if (!controller)
bc70ba9b
CB
3060 return errno == ENOENT ? -EPERM : -errno;
3061
237e200e
SH
3062 cgroup = find_cgroup_in_path(path);
3063 if (!cgroup)
3064 /* this is just /cgroup/controller */
bc70ba9b 3065 return -EPERM;
237e200e
SH
3066
3067 get_cgdir_and_path(cgroup, &cgdir, &last);
3068
3069 if (!last) {
3070 path1 = "/";
3071 path2 = cgdir;
3072 } else {
3073 path1 = cgdir;
3074 path2 = last;
3075 }
3076
3077 if (is_child_cgroup(controller, path1, path2)) {
3078 // get uid, gid, from '/tasks' file and make up a mode
3079 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3080 k = cgfs_get_key(controller, cgroup, "tasks");
3081
3082 } else
3083 k = cgfs_get_key(controller, path1, path2);
3084
3085 if (!k) {
3086 ret = -EINVAL;
3087 goto out;
3088 }
3089
3090 /*
3091 * This being a fuse request, the uid and gid must be valid
3092 * in the caller's namespace. So we can just check to make
3093 * sure that the caller is root in his uid, and privileged
3094 * over the file's current owner.
3095 */
3096 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3097 ret = -EACCES;
3098 goto out;
3099 }
3100
3101 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3102
3103out:
3104 free_key(k);
3105 free(cgdir);
3106
3107 return ret;
3108}
3109
3110int cg_chmod(const char *path, mode_t mode)
3111{
3112 struct fuse_context *fc = fuse_get_context();
3113 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3114 struct cgfs_files *k = NULL;
3115 const char *cgroup;
3116 int ret;
3117
3118 if (!fc)
3119 return -EIO;
3120
3121 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 3122 return -EPERM;
237e200e
SH
3123
3124 controller = pick_controller_from_path(fc, path);
3125 if (!controller)
bc70ba9b
CB
3126 return errno == ENOENT ? -EPERM : -errno;
3127
237e200e
SH
3128 cgroup = find_cgroup_in_path(path);
3129 if (!cgroup)
3130 /* this is just /cgroup/controller */
bc70ba9b 3131 return -EPERM;
237e200e
SH
3132
3133 get_cgdir_and_path(cgroup, &cgdir, &last);
3134
3135 if (!last) {
3136 path1 = "/";
3137 path2 = cgdir;
3138 } else {
3139 path1 = cgdir;
3140 path2 = last;
3141 }
3142
3143 if (is_child_cgroup(controller, path1, path2)) {
3144 // get uid, gid, from '/tasks' file and make up a mode
3145 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3146 k = cgfs_get_key(controller, cgroup, "tasks");
3147
3148 } else
3149 k = cgfs_get_key(controller, path1, path2);
3150
3151 if (!k) {
3152 ret = -EINVAL;
3153 goto out;
3154 }
3155
3156 /*
3157 * This being a fuse request, the uid and gid must be valid
3158 * in the caller's namespace. So we can just check to make
3159 * sure that the caller is root in his uid, and privileged
3160 * over the file's current owner.
3161 */
3162 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3163 ret = -EPERM;
3164 goto out;
3165 }
3166
3167 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3168 ret = -EINVAL;
3169 goto out;
3170 }
3171
3172 ret = 0;
3173out:
3174 free_key(k);
3175 free(cgdir);
3176 return ret;
3177}
3178
3179int cg_mkdir(const char *path, mode_t mode)
3180{
3181 struct fuse_context *fc = fuse_get_context();
3182 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3183 const char *cgroup;
3184 int ret;
3185
3186 if (!fc)
3187 return -EIO;
3188
237e200e
SH
3189 controller = pick_controller_from_path(fc, path);
3190 if (!controller)
2f7036d0 3191 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3192
3193 cgroup = find_cgroup_in_path(path);
3194 if (!cgroup)
bc70ba9b 3195 return -errno;
237e200e
SH
3196
3197 get_cgdir_and_path(cgroup, &cgdir, &last);
3198 if (!last)
3199 path1 = "/";
3200 else
3201 path1 = cgdir;
3202
3203 pid_t initpid = lookup_initpid_in_store(fc->pid);
3204 if (initpid <= 0)
3205 initpid = fc->pid;
3206 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3207 if (!next)
3208 ret = -EINVAL;
3209 else if (last && strcmp(next, last) == 0)
3210 ret = -EEXIST;
3211 else
2f7036d0 3212 ret = -EPERM;
237e200e
SH
3213 goto out;
3214 }
3215
3216 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3217 ret = -EACCES;
3218 goto out;
3219 }
3220 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3221 ret = -EACCES;
3222 goto out;
3223 }
3224
3225 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3226
3227out:
3228 free(cgdir);
3229 free(next);
3230 return ret;
3231}
3232
3233int cg_rmdir(const char *path)
3234{
3235 struct fuse_context *fc = fuse_get_context();
3236 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3237 const char *cgroup;
3238 int ret;
3239
3240 if (!fc)
3241 return -EIO;
3242
3243 controller = pick_controller_from_path(fc, path);
e254948f
CB
3244 if (!controller) /* Someone's trying to delete "/cgroup". */
3245 return -EPERM;
237e200e
SH
3246
3247 cgroup = find_cgroup_in_path(path);
e254948f
CB
3248 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3249 return -EPERM;
237e200e
SH
3250
3251 get_cgdir_and_path(cgroup, &cgdir, &last);
3252 if (!last) {
e254948f
CB
3253 /* Someone's trying to delete a cgroup on the same level as the
3254 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3255 * rmdir "/cgroup/blkio/init.slice".
3256 */
3257 ret = -EPERM;
237e200e
SH
3258 goto out;
3259 }
3260
3261 pid_t initpid = lookup_initpid_in_store(fc->pid);
3262 if (initpid <= 0)
3263 initpid = fc->pid;
3264 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3265 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3266 ret = -EBUSY;
3267 else
3268 ret = -ENOENT;
3269 goto out;
3270 }
3271
3272 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3273 ret = -EACCES;
3274 goto out;
3275 }
3276 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3277 ret = -EACCES;
3278 goto out;
3279 }
3280
3281 if (!cgfs_remove(controller, cgroup)) {
3282 ret = -EINVAL;
3283 goto out;
3284 }
3285
3286 ret = 0;
3287
3288out:
3289 free(cgdir);
3290 free(next);
3291 return ret;
3292}
3293
3294static bool startswith(const char *line, const char *pref)
3295{
3296 if (strncmp(line, pref, strlen(pref)) == 0)
3297 return true;
3298 return false;
3299}
3300
c6095b08
SH
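/* Pull the total_* counters we report from a memory.stat blob and convert
 * them from bytes to kB. */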
3301static void parse_memstat(char *memstat, unsigned long *cached,
3302 unsigned long *active_anon, unsigned long *inactive_anon,
3303 unsigned long *active_file, unsigned long *inactive_file,
559eaa8f 3304 unsigned long *unevictable, unsigned long *shmem)
237e200e
SH
3305{
3306 char *eol;
3307
237e200e 3308 while (*memstat) {
4accebfb
AS
3309 if (startswith(memstat, "total_cache")) {
3310 sscanf(memstat + 11, "%lu", cached);
c6095b08 3311 *cached /= 1024;
4accebfb
AS
3312 } else if (startswith(memstat, "total_active_anon")) {
3313 sscanf(memstat + 17, "%lu", active_anon);
c6095b08 3314 *active_anon /= 1024;
4accebfb
AS
3315 } else if (startswith(memstat, "total_inactive_anon")) {
3316 sscanf(memstat + 19, "%lu", inactive_anon);
c6095b08 3317 *inactive_anon /= 1024;
4accebfb
AS
3318 } else if (startswith(memstat, "total_active_file")) {
3319 sscanf(memstat + 17, "%lu", active_file);
c6095b08 3320 *active_file /= 1024;
4accebfb
AS
3321 } else if (startswith(memstat, "total_inactive_file")) {
3322 sscanf(memstat + 19, "%lu", inactive_file);
c6095b08 3323 *inactive_file /= 1024;
4accebfb
AS
3324 } else if (startswith(memstat, "total_unevictable")) {
3325 sscanf(memstat + 17, "%lu", unevictable);
c6095b08 3326 *unevictable /= 1024;
559eaa8f
JS
3327 } else if (startswith(memstat, "total_shmem")) {
3328 sscanf(memstat + 11, "%lu", shmem);
3329 *shmem /= 1024;
237e200e
SH
3330 }
3331 eol = strchr(memstat, '\n');
3332 if (!eol)
3333 return;
3334 memstat = eol+1;
3335 }
3336}
3337
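/* Scan a blkio stats blob for the "<major>:<minor> <iotype>" key and return
 * its value in *v, or 0 if the key is not present. */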
3338static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3339{
3340 char *eol;
3341 char key[32];
3342
3343 memset(key, 0, 32);
3344 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3345
3346 size_t len = strlen(key);
3347 *v = 0;
3348
3349 while (*str) {
3350 if (startswith(str, key)) {
3351 sscanf(str + len, "%lu", v);
3352 return;
3353 }
3354 eol = strchr(str, '\n');
3355 if (!eol)
3356 return;
3357 str = eol+1;
3358 }
3359}
3360
3361static int read_file(const char *path, char *buf, size_t size,
3362 struct file_info *d)
3363{
3364 size_t linelen = 0, total_len = 0, rv = 0;
3365 char *line = NULL;
3366 char *cache = d->buf;
3367 size_t cache_size = d->buflen;
3368 FILE *f = fopen(path, "r");
3369 if (!f)
3370 return 0;
3371
3372 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3373 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3374 if (l < 0) {
3375 perror("Error writing to cache");
3376 rv = 0;
3377 goto err;
3378 }
3379 if (l >= cache_size) {
b8defc3d 3380 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3381 rv = 0;
3382 goto err;
3383 }
3384 cache += l;
3385 cache_size -= l;
3386 total_len += l;
3387 }
3388
3389 d->size = total_len;
a262ddb7
CB
3390 if (total_len > size)
3391 total_len = size;
237e200e
SH
3392
3393 /* read from off 0 */
3394 memcpy(buf, d->buf, total_len);
3395 rv = total_len;
3396 err:
3397 fclose(f);
3398 free(line);
3399 return rv;
3400}
3401
3402/*
3403 * FUSE ops for /proc
3404 */
3405
018246ff 3406static unsigned long get_memlimit(const char *cgroup, const char *file)
237e200e
SH
3407{
3408 char *memlimit_str = NULL;
3409 unsigned long memlimit = -1;
3410
018246ff 3411 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
237e200e
SH
3412 memlimit = strtoul(memlimit_str, NULL, 10);
3413
3414 free(memlimit_str);
3415
3416 return memlimit;
3417}
3418
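/* Walk from @cgroup up to the root of the hierarchy and return the smallest
 * limit found in @file along the way, since an ancestor's limit also caps us. */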
018246ff 3419static unsigned long get_min_memlimit(const char *cgroup, const char *file)
237e200e
SH
3420{
3421 char *copy = strdupa(cgroup);
3422 unsigned long memlimit = 0, retlimit;
3423
018246ff 3424 retlimit = get_memlimit(copy, file);
237e200e
SH
3425
3426 while (strcmp(copy, "/") != 0) {
3427 copy = dirname(copy);
018246ff 3428 memlimit = get_memlimit(copy, file);
237e200e
SH
3429 if (memlimit != -1 && memlimit < retlimit)
3430 retlimit = memlimit;
3431 };
3432
3433 return retlimit;
3434}
3435
3436static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3437 struct fuse_file_info *fi)
3438{
3439 struct fuse_context *fc = fuse_get_context();
3440 struct file_info *d = (struct file_info *)fi->fh;
3441 char *cg;
3442 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3443 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3444 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3445 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
559eaa8f 3446 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
594a10e6 3447 hostswtotal = 0;
237e200e
SH
3448 char *line = NULL;
3449 size_t linelen = 0, total_len = 0, rv = 0;
3450 char *cache = d->buf;
3451 size_t cache_size = d->buflen;
3452 FILE *f = NULL;
3453
3454 if (offset){
3455 if (offset > d->size)
3456 return -EINVAL;
3457 if (!d->cached)
3458 return 0;
3459 int left = d->size - offset;
3460 total_len = left > size ? size: left;
3461 memcpy(buf, cache + offset, total_len);
3462 return total_len;
3463 }
3464
3465 pid_t initpid = lookup_initpid_in_store(fc->pid);
3466 if (initpid <= 0)
3467 initpid = fc->pid;
3468 cg = get_pid_cgroup(initpid, "memory");
3469 if (!cg)
3470 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3471 prune_init_slice(cg);
237e200e 3472
018246ff 3473 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3474 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3475 goto err;
3476 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3477 goto err;
3478
3479	// The following values are allowed to fail, because swapaccount might be
3480	// turned off for the current kernel
3481 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3482 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3483 {
018246ff 3484 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3485 memswusage = strtoul(memswusage_str, NULL, 10);
3486
237e200e
SH
3487 memswlimit = memswlimit / 1024;
3488 memswusage = memswusage / 1024;
3489 }
3490
3491 memusage = strtoul(memusage_str, NULL, 10);
3492 memlimit /= 1024;
3493 memusage /= 1024;
3494
c6095b08
SH
3495 parse_memstat(memstat_str, &cached, &active_anon,
3496 &inactive_anon, &active_file, &inactive_file,
559eaa8f 3497 &unevictable, &shmem);
237e200e
SH
3498
3499 f = fopen("/proc/meminfo", "r");
3500 if (!f)
3501 goto err;
3502
3503 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3504 ssize_t l;
237e200e
SH
3505 char *printme, lbuf[100];
3506
3507 memset(lbuf, 0, 100);
3508 if (startswith(line, "MemTotal:")) {
594a10e6 3509 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3510 if (hosttotal < memlimit)
3511 memlimit = hosttotal;
3512 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3513 printme = lbuf;
3514 } else if (startswith(line, "MemFree:")) {
3515 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3516 printme = lbuf;
3517 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3518 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3519 printme = lbuf;
3520 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3521 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3522 if (hostswtotal < memswlimit)
3523 memswlimit = hostswtotal;
3524 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3525 printme = lbuf;
3526 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3527 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3528 swapusage = memswusage - memusage,
3529 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3530 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3531 printme = lbuf;
da35d72a
SH
3532 } else if (startswith(line, "Slab:")) {
3533 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3534 printme = lbuf;
237e200e
SH
3535 } else if (startswith(line, "Buffers:")) {
3536 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3537 printme = lbuf;
3538 } else if (startswith(line, "Cached:")) {
3539 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3540 printme = lbuf;
3541 } else if (startswith(line, "SwapCached:")) {
3542 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3543 printme = lbuf;
2f306ad3 3544 } else if (startswith(line, "Active:")) {
c6095b08
SH
3545 snprintf(lbuf, 100, "Active: %8lu kB\n",
3546 active_anon + active_file);
3547 printme = lbuf;
2f306ad3 3548 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3549 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3550 inactive_anon + inactive_file);
3551 printme = lbuf;
3552 } else if (startswith(line, "Active(anon)")) {
3553 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3554 printme = lbuf;
3555 } else if (startswith(line, "Inactive(anon)")) {
3556 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3557 printme = lbuf;
3558 } else if (startswith(line, "Active(file)")) {
3559 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3560 printme = lbuf;
3561 } else if (startswith(line, "Inactive(file)")) {
3562 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3563 printme = lbuf;
3564 } else if (startswith(line, "Unevictable")) {
3565 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3566 printme = lbuf;
3567 } else if (startswith(line, "SReclaimable")) {
3568 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3569 printme = lbuf;
3570 } else if (startswith(line, "SUnreclaim")) {
3571 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3572 printme = lbuf;
559eaa8f
JS
3573 } else if (startswith(line, "Shmem:")) {
3574 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3575 printme = lbuf;
28cdea9b
JS
3576 } else if (startswith(line, "ShmemHugePages")) {
3577 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3578 printme = lbuf;
3579 } else if (startswith(line, "ShmemPmdMapped")) {
3580 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3581 printme = lbuf;
237e200e
SH
3582 } else
3583 printme = line;
3584
3585 l = snprintf(cache, cache_size, "%s", printme);
3586 if (l < 0) {
3587 perror("Error writing to cache");
3588 rv = 0;
3589 goto err;
3590
3591 }
3592 if (l >= cache_size) {
b8defc3d 3593 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3594 rv = 0;
3595 goto err;
3596 }
3597
3598 cache += l;
3599 cache_size -= l;
3600 total_len += l;
3601 }
3602
3603 d->cached = 1;
3604 d->size = total_len;
3605 if (total_len > size ) total_len = size;
3606 memcpy(buf, d->buf, total_len);
3607
3608 rv = total_len;
3609err:
3610 if (f)
3611 fclose(f);
3612 free(line);
3613 free(cg);
3614 free(memusage_str);
3615 free(memswlimit_str);
3616 free(memswusage_str);
3617 free(memstat_str);
237e200e
SH
3618 return rv;
3619}
3620
3621/*
3622 * Read the cpuset.cpus for cg
3623 * Return the answer in a newly allocated string which must be freed
3624 */
3625static char *get_cpuset(const char *cg)
3626{
3627 char *answer;
3628
3629 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3630 return NULL;
3631 return answer;
3632}
3633
3634bool cpu_in_cpuset(int cpu, const char *cpuset);
3635
3636static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3637{
3638 int cpu;
3639
3640 if (sscanf(line, "processor : %d", &cpu) != 1)
3641 return false;
3642 return cpu_in_cpuset(cpu, cpuset);
3643}
3644
c59d6a55
JS
3645/*
3646 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3647 * depending on `param`. The parameter value is returned through `value`.
3648 */
3649static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3650{
3651 bool rv = false;
3652 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3653 char *str = NULL;
3654
3655 sprintf(file, "cpu.cfs_%s_us", param);
3656
3657 if (!cgfs_get_value("cpu", cg, file, &str))
3658 goto err;
3659
3660 if (sscanf(str, "%ld", value) != 1)
3661 goto err;
3662
3663 rv = true;
3664
3665err:
3666 if (str)
3667 free(str);
3668 return rv;
3669}
3670
3671/*
3672 * Return the maximum number of visible CPUs based on CPU quotas.
3673 * If there is no quota set, zero is returned.
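 * The result is the quota/period ratio rounded up, capped at the host's
 * processor count: e.g. a 150000us quota with a 100000us period yields 2.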
3674 */
3675int max_cpu_count(const char *cg)
3676{
3677 int rv, nprocs;
3678 int64_t cfs_quota, cfs_period;
3679
3680 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3681 return 0;
3682
3683 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3684 return 0;
3685
3686 if (cfs_quota <= 0 || cfs_period <= 0)
3687 return 0;
3688
3689 rv = cfs_quota / cfs_period;
3690
3691 /* In case quota/period does not yield a whole number, add one CPU for
3692 * the remainder.
3693 */
3694 if ((cfs_quota % cfs_period) > 0)
3695 rv += 1;
3696
3697 nprocs = get_nprocs();
3698
3699 if (rv > nprocs)
3700 rv = nprocs;
3701
3702 return rv;
3703}
3704
3705/*
3706 * Determine whether CPU views should be used or not.
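 * CPU views can only be used when both the cpu and cpuacct controllers are
 * mounted.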
3707 */
3708bool use_cpuview(const char *cg)
3709{
3710 int cfd;
3711 char *tmpc;
3712
3713 tmpc = find_mounted_controller("cpu", &cfd);
3714 if (!tmpc)
3715 return false;
3716
3717 tmpc = find_mounted_controller("cpuacct", &cfd);
3718 if (!tmpc)
3719 return false;
3720
3721 return true;
3722}
3723
237e200e
SH
3724/*
3725 * check whether this is a '^processor" line in /proc/cpuinfo
3726 */
3727static bool is_processor_line(const char *line)
3728{
3729 int cpu;
3730
3731 if (sscanf(line, "processor : %d", &cpu) == 1)
3732 return true;
3733 return false;
3734}
3735
3736static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3737 struct fuse_file_info *fi)
3738{
3739 struct fuse_context *fc = fuse_get_context();
3740 struct file_info *d = (struct file_info *)fi->fh;
3741 char *cg;
3742 char *cpuset = NULL;
3743 char *line = NULL;
3744 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79 3745 bool am_printing = false, firstline = true, is_s390x = false;
c59d6a55
JS
3746 int curcpu = -1, cpu, max_cpus = 0;
3747 bool use_view;
237e200e
SH
3748 char *cache = d->buf;
3749 size_t cache_size = d->buflen;
3750 FILE *f = NULL;
3751
3752 if (offset){
3753 if (offset > d->size)
3754 return -EINVAL;
3755 if (!d->cached)
3756 return 0;
3757 int left = d->size - offset;
3758 total_len = left > size ? size: left;
3759 memcpy(buf, cache + offset, total_len);
3760 return total_len;
3761 }
3762
3763 pid_t initpid = lookup_initpid_in_store(fc->pid);
3764 if (initpid <= 0)
3765 initpid = fc->pid;
3766 cg = get_pid_cgroup(initpid, "cpuset");
3767 if (!cg)
3768		return read_file("/proc/cpuinfo", buf, size, d);
6d2f6996 3769 prune_init_slice(cg);
237e200e
SH
3770
3771 cpuset = get_cpuset(cg);
3772 if (!cpuset)
3773 goto err;
3774
c59d6a55
JS
3775 use_view = use_cpuview(cg);
3776
3777 if (use_view)
3778 max_cpus = max_cpu_count(cg);
3779
237e200e
SH
3780 f = fopen("/proc/cpuinfo", "r");
3781 if (!f)
3782 goto err;
3783
3784 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3785 ssize_t l;
f676eb79
SH
3786 if (firstline) {
3787 firstline = false;
3788 if (strstr(line, "IBM/S390") != NULL) {
3789 is_s390x = true;
3790 am_printing = true;
5ed9d4e2 3791 continue;
f676eb79
SH
3792 }
3793 }
5ed9d4e2
SH
3794 if (strncmp(line, "# processors:", 12) == 0)
3795 continue;
237e200e 3796 if (is_processor_line(line)) {
c59d6a55
JS
3797 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3798 break;
237e200e
SH
3799 am_printing = cpuline_in_cpuset(line, cpuset);
3800 if (am_printing) {
3801 curcpu ++;
3802 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3803 if (l < 0) {
3804 perror("Error writing to cache");
3805 rv = 0;
3806 goto err;
3807 }
3808 if (l >= cache_size) {
b8defc3d 3809 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3810 rv = 0;
3811 goto err;
3812 }
3813 cache += l;
3814 cache_size -= l;
3815 total_len += l;
3816 }
3817 continue;
f676eb79
SH
3818 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3819 char *p;
c59d6a55
JS
3820 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3821 break;
f676eb79
SH
3822 if (!cpu_in_cpuset(cpu, cpuset))
3823 continue;
3824 curcpu ++;
3825 p = strchr(line, ':');
3826 if (!p || !*p)
3827 goto err;
3828 p++;
5ed9d4e2 3829 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3830 if (l < 0) {
3831 perror("Error writing to cache");
3832 rv = 0;
3833 goto err;
3834 }
3835 if (l >= cache_size) {
b8defc3d 3836 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3837 rv = 0;
3838 goto err;
3839 }
3840 cache += l;
3841 cache_size -= l;
3842 total_len += l;
3843 continue;
3844
237e200e
SH
3845 }
3846 if (am_printing) {
3847 l = snprintf(cache, cache_size, "%s", line);
3848 if (l < 0) {
3849 perror("Error writing to cache");
3850 rv = 0;
3851 goto err;
3852 }
3853 if (l >= cache_size) {
b8defc3d 3854 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3855 rv = 0;
3856 goto err;
3857 }
3858 cache += l;
3859 cache_size -= l;
3860 total_len += l;
3861 }
3862 }
3863
5ed9d4e2
SH
3864 if (is_s390x) {
3865 char *origcache = d->buf;
a262ddb7 3866 ssize_t l;
5ed9d4e2
SH
3867 do {
3868 d->buf = malloc(d->buflen);
3869 } while (!d->buf);
3870 cache = d->buf;
3871 cache_size = d->buflen;
3872 total_len = 0;
3873 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3874 if (l < 0 || l >= cache_size) {
3875 free(origcache);
3876 goto err;
3877 }
3878 cache_size -= l;
3879 cache += l;
3880 total_len += l;
3881 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3882 if (l < 0 || l >= cache_size) {
3883 free(origcache);
3884 goto err;
3885 }
3886 cache_size -= l;
3887 cache += l;
3888 total_len += l;
3889 l = snprintf(cache, cache_size, "%s", origcache);
3890 free(origcache);
3891 if (l < 0 || l >= cache_size)
3892 goto err;
3893 total_len += l;
3894 }
3895
237e200e
SH
3896 d->cached = 1;
3897 d->size = total_len;
3898 if (total_len > size ) total_len = size;
3899
3900 /* read from off 0 */
3901 memcpy(buf, d->buf, total_len);
3902 rv = total_len;
3903err:
3904 if (f)
3905 fclose(f);
3906 free(line);
3907 free(cpuset);
3908 free(cg);
3909 return rv;
3910}
3911
0ecddf02 3912static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3913{
9ac264cf 3914 int ret;
0ecddf02
CB
3915 FILE *f;
3916 uint64_t starttime;
3917 /* strlen("/proc/") = 6
3918 * +
3919 * LXCFS_NUMSTRLEN64
3920 * +
3921 * strlen("/stat") = 5
3922 * +
3923 * \0 = 1
3924 * */
3925#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3926 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3927 pid_t qpid;
3928
3929 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3930 if (qpid <= 0) {
3931 /* Caller can check for EINVAL on 0. */
3932 errno = EINVAL;
9ac264cf 3933 return 0;
0ecddf02 3934 }
9ac264cf 3935
0ecddf02
CB
3936 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3937 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3938 /* Caller can check for EINVAL on 0. */
3939 errno = EINVAL;
9ac264cf 3940 return 0;
0ecddf02 3941 }
9ac264cf 3942
0ecddf02
CB
3943 f = fopen(path, "r");
3944 if (!f) {
3945 /* Caller can check for EINVAL on 0. */
3946 errno = EINVAL;
9ac264cf 3947 return 0;
0ecddf02 3948 }
9ac264cf 3949
0ecddf02
CB
3950	/* Note that the *scanf() argument suppression requires that length
3951 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3952 * at us. It's like telling someone you're not married and then asking
3953 * if you can bring your wife to the party.
3954 */
3955 ret = fscanf(f, "%*d " /* (1) pid %d */
3956 "%*s " /* (2) comm %s */
3957 "%*c " /* (3) state %c */
3958 "%*d " /* (4) ppid %d */
3959 "%*d " /* (5) pgrp %d */
3960 "%*d " /* (6) session %d */
3961 "%*d " /* (7) tty_nr %d */
3962 "%*d " /* (8) tpgid %d */
3963 "%*u " /* (9) flags %u */
3964 "%*u " /* (10) minflt %lu */
3965 "%*u " /* (11) cminflt %lu */
3966 "%*u " /* (12) majflt %lu */
3967 "%*u " /* (13) cmajflt %lu */
3968 "%*u " /* (14) utime %lu */
3969 "%*u " /* (15) stime %lu */
3970 "%*d " /* (16) cutime %ld */
3971 "%*d " /* (17) cstime %ld */
3972 "%*d " /* (18) priority %ld */
3973 "%*d " /* (19) nice %ld */
3974 "%*d " /* (20) num_threads %ld */
3975 "%*d " /* (21) itrealvalue %ld */
3976 "%" PRIu64, /* (22) starttime %llu */
3977 &starttime);
3978 if (ret != 1) {
3979 fclose(f);
3980 /* Caller can check for EINVAL on 0. */
3981 errno = EINVAL;
3982 return 0;
3983 }
3984
3985 fclose(f);
3986
3987 errno = 0;
3988 return starttime;
3989}
3990
3991static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3992{
3993 uint64_t clockticks;
3994 int64_t ticks_per_sec;
3995
3996 clockticks = get_reaper_start_time(pid);
3997 if (clockticks == 0 && errno == EINVAL) {
3998 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3999 return 0;
4000 }
4001
4002 ticks_per_sec = sysconf(_SC_CLK_TCK);
4003 if (ticks_per_sec < 0 && errno == EINVAL) {
4004 lxcfs_debug(
4005 "%s\n",
4006 "failed to determine number of clock ticks in a second");
4007 return 0;
4008 }
4009
4010 return (clockticks /= ticks_per_sec);
4011}
4012
4013static uint64_t get_reaper_age(pid_t pid)
4014{
4015 uint64_t procstart, uptime, procage;
4016
4017	/* To get the actual reaper age, subtract the time (since system boot)
4018	 * at which the reaper process started from the current system
4019	 * uptime.
4020	 */
4021 procstart = get_reaper_start_time_in_sec(pid);
4022 procage = procstart;
4023 if (procstart > 0) {
4024 int ret;
4025 struct timespec spec;
4026
4027 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4028 if (ret < 0)
4029 return 0;
4030 /* We could make this more precise here by using the tv_nsec
4031 * field in the timespec struct and convert it to milliseconds
4032 * and then create a double for the seconds and milliseconds but
4033 * that seems more work than it is worth.
4034 */
4035 uptime = spec.tv_sec;
4036 procage = uptime - procstart;
4037 }
4038
4039 return procage;
4040}
4041
8be92dd1
JS
4042/*
4043 * Returns 0 on success.
4044 * It is the caller's responsibility to free `return_usage`, unless this
4045 * function returns an error.
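 * cpuacct.usage_all starts with a "cpu user system" header line followed by
 * one "<cpu> <user> <system>" row per CPU, with times given in nanoseconds.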
4046 */
79612c8b 4047static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
8be92dd1 4048{
77005a6c 4049 int cpucount = get_nprocs_conf();
8be92dd1
JS
4050 struct cpuacct_usage *cpu_usage;
4051 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4052 int cg_cpu;
4053 uint64_t cg_user, cg_system;
4054 int64_t ticks_per_sec;
4055 char *usage_str = NULL;
4056
4057 ticks_per_sec = sysconf(_SC_CLK_TCK);
4058
4059 if (ticks_per_sec < 0 && errno == EINVAL) {
4060 lxcfs_debug(
4061 "%s\n",
4062 "read_cpuacct_usage_all failed to determine number of clock ticks "
4063 "in a second");
4064 return -1;
4065 }
4066
4067 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4068 if (!cpu_usage)
4069 return -ENOMEM;
4070
4071 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4072 rv = -1;
4073 goto err;
4074 }
4075
4076 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4077 lxcfs_error("read_cpuacct_usage_all reading first line from "
4078 "%s/cpuacct.usage_all failed.\n", cg);
4079 rv = -1;
4080 goto err;
4081 }
4082
4083 read_pos += read_cnt;
4084
4085 for (i = 0, j = 0; i < cpucount; i++) {
4086 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4087 &cg_system, &read_cnt);
4088
4089 if (ret == EOF)
4090 break;
4091
4092 if (ret != 3) {
4093 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4094 "failed.\n", cg);
4095 rv = -1;
4096 goto err;
4097 }
4098
4099 read_pos += read_cnt;
4100
8be92dd1
JS
4101 /* Convert the time from nanoseconds to USER_HZ */
4102 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4103 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4104 j++;
4105 }
4106
4107 rv = 0;
4108 *return_usage = cpu_usage;
79612c8b 4109 *size = cpucount;
8be92dd1
JS
4110
4111err:
4112 if (usage_str)
4113 free(usage_str);
4114
4115 if (rv != 0) {
4116 free(cpu_usage);
4117 *return_usage = NULL;
4118 }
4119
4120 return rv;
4121}
4122
056adcef
JS
4123static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4124{
4125 int i;
4126 unsigned long sum = 0;
4127
4128 for (i = 0; i < cpu_count; i++) {
77005a6c
JS
4129 if (!newer[i].online)
4130 continue;
4131
056adcef
JS
4132 /* When cpuset is changed on the fly, the CPUs might get reordered.
4133 * We could either reset all counters, or check that the subtractions
4134 * below will return expected results.
4135 */
4136 if (newer[i].user > older[i].user)
4137 diff[i].user = newer[i].user - older[i].user;
4138 else
4139 diff[i].user = 0;
4140
4141 if (newer[i].system > older[i].system)
4142 diff[i].system = newer[i].system - older[i].system;
4143 else
4144 diff[i].system = 0;
4145
4146 if (newer[i].idle > older[i].idle)
4147 diff[i].idle = newer[i].idle - older[i].idle;
4148 else
4149 diff[i].idle = 0;
4150
4151 sum += diff[i].user;
4152 sum += diff[i].system;
4153 sum += diff[i].idle;
4154 }
4155
4156 return sum;
4157}
4158
4159static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4160{
4161 unsigned long free_space, to_add;
4162
4163 free_space = threshold - usage->user - usage->system;
4164
4165 if (free_space > usage->idle)
4166 free_space = usage->idle;
4167
4168 to_add = free_space > *surplus ? *surplus : free_space;
4169
4170 *counter += to_add;
4171 usage->idle -= to_add;
4172 *surplus -= to_add;
4173}
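/* A worked example with made-up numbers: say the per-CPU threshold is 1000
 * ticks and this CPU's diff holds user = 300, system = 200, idle = 600 while
 * the remaining surplus is 400. Then free_space = 1000 - 300 - 200 = 500,
 * which is below idle (600), so to_add = min(500, 400) = 400: the counter
 * grows by 400, idle drops to 200 and the surplus is used up.
 */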
4174
951acc94
JS
4175static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4176{
4177 struct cg_proc_stat *first = NULL, *prev, *tmp;
4178
4179 for (prev = NULL; node; ) {
4180 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4181 tmp = node;
4182 lxcfs_debug("Removing stat node for %s\n", node->cg);
4183
4184 if (prev)
4185 prev->next = node->next;
4186 else
4187 first = node->next;
4188
4189 node = node->next;
4190 free_proc_stat_node(tmp);
4191 } else {
4192 if (!first)
4193 first = node;
4194 prev = node;
4195 node = node->next;
4196 }
4197 }
4198
4199 return first;
4200}
4201
4202#define PROC_STAT_PRUNE_INTERVAL 10
4203static void prune_proc_stat_history(void)
4204{
4205 int i;
4206 time_t now = time(NULL);
4207
4208 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
2f49b662
JS
4209 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4210
4211 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4212 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94 4213 return;
2f49b662 4214 }
951acc94 4215
2f49b662
JS
4216 if (proc_stat_history[i]->next) {
4217 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4218 proc_stat_history[i]->lastcheck = now;
4219 }
951acc94 4220
2f49b662 4221 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94
JS
4222 }
4223}
4224
2f49b662 4225static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
056adcef 4226{
056adcef
JS
4227 struct cg_proc_stat *node;
4228
2f49b662
JS
4229 pthread_rwlock_rdlock(&head->lock);
4230
4231 if (!head->next) {
4232 pthread_rwlock_unlock(&head->lock);
056adcef 4233 return NULL;
2f49b662 4234 }
056adcef
JS
4235
4236 node = head->next;
4237
4238 do {
4239 if (strcmp(cg, node->cg) == 0)
951acc94 4240 goto out;
056adcef
JS
4241 } while ((node = node->next));
4242
951acc94
JS
4243 node = NULL;
4244
4245out:
2f49b662 4246 pthread_rwlock_unlock(&head->lock);
951acc94
JS
4247 prune_proc_stat_history();
4248 return node;
056adcef
JS
4249}
4250
4251static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4252{
4253 struct cg_proc_stat *node;
4254 int i;
4255
4256 node = malloc(sizeof(struct cg_proc_stat));
4257 if (!node)
4258 goto err;
4259
4260 node->cg = NULL;
4261 node->usage = NULL;
4262 node->view = NULL;
4263
4264 node->cg = malloc(strlen(cg) + 1);
4265 if (!node->cg)
4266 goto err;
4267
4268 strcpy(node->cg, cg);
4269
4270 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4271 if (!node->usage)
4272 goto err;
4273
4274 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4275
4276 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4277 if (!node->view)
4278 goto err;
4279
4280 node->cpu_count = cpu_count;
4281 node->next = NULL;
4282
2f49b662
JS
4283 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4284 lxcfs_error("%s\n", "Failed to initialize node lock");
4285 goto err;
4286 }
4287
056adcef
JS
4288 for (i = 0; i < cpu_count; i++) {
4289 node->view[i].user = 0;
4290 node->view[i].system = 0;
4291 node->view[i].idle = 0;
4292 }
4293
4294 return node;
4295
4296err:
4297 if (node && node->cg)
4298 free(node->cg);
4299 if (node && node->usage)
4300 free(node->usage);
4301 if (node && node->view)
4302 free(node->view);
4303 if (node)
4304 free(node);
4305
4306 return NULL;
4307}
4308
2f49b662 4309static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
056adcef
JS
4310{
4311 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4312 struct cg_proc_stat_head *head = proc_stat_history[hash];
2f49b662
JS
4313 struct cg_proc_stat *node, *rv = new_node;
4314
4315 pthread_rwlock_wrlock(&head->lock);
056adcef
JS
4316
4317 if (!head->next) {
4318 head->next = new_node;
2f49b662 4319 goto out;
056adcef
JS
4320 }
4321
2f49b662
JS
4322 node = head->next;
4323
056adcef 4324 for (;;) {
2f49b662
JS
4325 if (strcmp(node->cg, new_node->cg) == 0) {
4326 /* The node is already present, return it */
4327 free_proc_stat_node(new_node);
4328 rv = node;
4329 goto out;
4330 }
056adcef
JS
4331
4332 if (node->next) {
4333 node = node->next;
4334 continue;
4335 }
4336
4337 node->next = new_node;
2f49b662
JS
4338 goto out;
4339 }
4340
4341out:
4342 pthread_rwlock_unlock(&head->lock);
4343 return rv;
4344}
4345
895f28e5
JS
4346static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4347{
4348 struct cpuacct_usage *new_usage, *new_view;
4349 int i;
4350
4351 /* Allocate new memory */
4352 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4353 if (!new_usage)
4354 return false;
4355
4356 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4357 if (!new_view) {
4358 free(new_usage);
4359 return false;
4360 }
4361
4362 /* Copy existing data & initialize new elements */
4363 for (i = 0; i < cpu_count; i++) {
4364 if (i < node->cpu_count) {
4365 new_usage[i].user = node->usage[i].user;
4366 new_usage[i].system = node->usage[i].system;
4367 new_usage[i].idle = node->usage[i].idle;
4368
4369 new_view[i].user = node->view[i].user;
4370 new_view[i].system = node->view[i].system;
4371 new_view[i].idle = node->view[i].idle;
4372 } else {
4373 new_usage[i].user = 0;
4374 new_usage[i].system = 0;
4375 new_usage[i].idle = 0;
4376
4377 new_view[i].user = 0;
4378 new_view[i].system = 0;
4379 new_view[i].idle = 0;
4380 }
4381 }
4382
4383 free(node->usage);
4384 free(node->view);
4385
4386 node->usage = new_usage;
4387 node->view = new_view;
4388 node->cpu_count = cpu_count;
4389
4390 return true;
4391}
4392
2f49b662
JS
4393static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4394{
4395 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4396 struct cg_proc_stat_head *head = proc_stat_history[hash];
4397 struct cg_proc_stat *node;
4398
4399 node = find_proc_stat_node(head, cg);
4400
4401 if (!node) {
4402 node = new_proc_stat_node(usage, cpu_count, cg);
4403 if (!node)
4404 return NULL;
4405
4406 node = add_proc_stat_node(node);
4407 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
056adcef 4408 }
2f49b662
JS
4409
4410 pthread_mutex_lock(&node->lock);
895f28e5
JS
4411
4412 /* If additional CPUs on the host have been enabled, CPU usage counter
4413 * arrays have to be expanded */
4414 if (node->cpu_count < cpu_count) {
4415 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4416 node->cpu_count, cpu_count, cg);
4417
4418 if (!expand_proc_stat_node(node, cpu_count)) {
4419 pthread_mutex_unlock(&node->lock);
4420 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4421 node->cpu_count, cpu_count, cg);
4422 return NULL;
4423 }
4424 }
4425
2f49b662 4426 return node;
056adcef
JS
4427}
4428
4429static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4430{
4431 int i;
4432
4433 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4434 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4435
4436 for (i = 0; i < cpu_count; i++) {
4437 node->view[i].user = 0;
4438 node->view[i].system = 0;
4439 node->view[i].idle = 0;
4440 }
4441
4442 node->cpu_count = cpu_count;
4443}
4444
79612c8b 4445static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
056adcef
JS
4446{
4447 char *line = NULL;
4448 size_t linelen = 0, total_len = 0, rv = 0; ssize_t l;
4449 int curcpu = -1; /* cpu numbering starts at 0 */
77005a6c 4450 int physcpu, i;
056adcef
JS
4451 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4452 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4453 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4454 unsigned long user_surplus = 0, system_surplus = 0;
4455 unsigned long total_sum, threshold;
4456 struct cg_proc_stat *stat_node;
4457 struct cpuacct_usage *diff = NULL;
77005a6c 4458 int nprocs = get_nprocs_conf();
056adcef 4459
79612c8b
JS
4460 if (cg_cpu_usage_size < nprocs)
4461 nprocs = cg_cpu_usage_size;
4462
056adcef
JS
4463 /* Read all CPU stats and stop when we've encountered other lines */
4464 while (getline(&line, &linelen, f) != -1) {
77005a6c 4465 int ret;
056adcef
JS
4466 char cpu_char[10]; /* That's a lot of cores */
4467 uint64_t all_used, cg_used;
4468
4469 if (strlen(line) == 0)
4470 continue;
4471 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4472 /* not a ^cpuN line containing a number N */
4473 break;
4474 }
4475
77005a6c 4476 if (sscanf(cpu_char, "%d", &physcpu) != 1)
056adcef 4477 continue;
77005a6c 4478
79612c8b
JS
4479 if (physcpu >= cg_cpu_usage_size)
4480 continue;
4481
056adcef
JS
4482 curcpu ++;
4483 cpu_cnt ++;
4484
77005a6c
JS
4485 if (!cpu_in_cpuset(physcpu, cpuset)) {
4486 for (i = curcpu; i <= physcpu; i++) {
4487 cg_cpu_usage[i].online = false;
4488 }
4489 continue;
4490 }
4491
4492 if (curcpu < physcpu) {
4493 /* Some CPUs may be disabled */
4494 for (i = curcpu; i < physcpu; i++)
4495 cg_cpu_usage[i].online = false;
4496
4497 curcpu = physcpu;
4498 }
4499
4500 cg_cpu_usage[curcpu].online = true;
4501
056adcef
JS
4502 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4503 &user,
4504 &nice,
4505 &system,
4506 &idle,
4507 &iowait,
4508 &irq,
4509 &softirq,
4510 &steal,
4511 &guest,
4512 &guest_nice);
4513
4514 if (ret != 10)
4515 continue;
4516
4517 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4518 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4519
4520 if (all_used >= cg_used) {
4521 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4522
4523 } else {
4524 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4525 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4526 curcpu, cg, all_used, cg_used);
4527 cg_cpu_usage[curcpu].idle = idle;
4528 }
4529 }
4530
4531 /* Cannot use more CPUs than is available due to cpuset */
4532 if (max_cpus > cpu_cnt)
4533 max_cpus = cpu_cnt;
4534
2f49b662 4535 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
056adcef
JS
4536
4537 if (!stat_node) {
2f49b662
JS
4538 lxcfs_error("unable to find/create stat node for %s\n", cg);
4539 rv = 0;
4540 goto err;
056adcef
JS
4541 }
4542
4543 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4544 if (!diff) {
4545 rv = 0;
4546 goto err;
4547 }
4548
4549 /*
4550 * If the new values are LOWER than values stored in memory, it means
4551 * the cgroup has been reset/recreated and we should reset too.
4552 */
77005a6c
JS
4553 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4554 if (!cg_cpu_usage[curcpu].online)
4555 continue;
4556
4557 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4558 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4559
4560 break;
4561 }
056adcef 4562
77005a6c
JS
4563 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4564
4565 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4566 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4567
4568 if (!stat_node->usage[curcpu].online)
4569 continue;
4570
4571 i++;
056adcef 4572
056adcef
JS
4573 stat_node->usage[curcpu].user += diff[curcpu].user;
4574 stat_node->usage[curcpu].system += diff[curcpu].system;
4575 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4576
77005a6c 4577 if (max_cpus > 0 && i >= max_cpus) {
056adcef
JS
4578 user_surplus += diff[curcpu].user;
4579 system_surplus += diff[curcpu].system;
4580 }
4581 }
4582
4583 /* Calculate usage counters of visible CPUs */
4584 if (max_cpus > 0) {
4585 /* threshold = maximum usage per cpu, including idle */
4586 threshold = total_sum / cpu_cnt * max_cpus;
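		/* For instance (made-up numbers): with cpu_cnt = 4 online CPUs,
		 * max_cpus = 2 and total_sum = 4000 ticks in this interval,
		 * threshold = 4000 / 4 * 2 = 2000 ticks of combined
		 * user + system + idle time per visible CPU. */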
4587
77005a6c
JS
4588 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4589 if (i == max_cpus)
4590 break;
4591
4592 if (!stat_node->usage[curcpu].online)
4593 continue;
4594
4595 i++;
4596
056adcef
JS
4597 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4598 continue;
4599
4600 /* Add user */
4601 add_cpu_usage(
4602 &user_surplus,
4603 &diff[curcpu],
4604 &diff[curcpu].user,
4605 threshold);
4606
4607 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4608 continue;
4609
4610 /* If there is still room, add system */
4611 add_cpu_usage(
4612 &system_surplus,
4613 &diff[curcpu],
4614 &diff[curcpu].system,
4615 threshold);
4616 }
4617
4618 if (user_surplus > 0)
4619 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4620 if (system_surplus > 0)
4621 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4622
77005a6c
JS
4623 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4624 if (i == max_cpus)
4625 break;
4626
4627 if (!stat_node->usage[curcpu].online)
4628 continue;
4629
4630 i++;
4631
056adcef
JS
4632 stat_node->view[curcpu].user += diff[curcpu].user;
4633 stat_node->view[curcpu].system += diff[curcpu].system;
4634 stat_node->view[curcpu].idle += diff[curcpu].idle;
4635
4636 user_sum += stat_node->view[curcpu].user;
4637 system_sum += stat_node->view[curcpu].system;
4638 idle_sum += stat_node->view[curcpu].idle;
4639 }
4640
4641 } else {
77005a6c
JS
4642 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4643 if (!stat_node->usage[curcpu].online)
4644 continue;
4645
056adcef
JS
4646 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4647 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4648 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4649
4650 user_sum += stat_node->view[curcpu].user;
4651 system_sum += stat_node->view[curcpu].system;
4652 idle_sum += stat_node->view[curcpu].idle;
4653 }
4654 }
4655
4656 /* Render the file */
4657 /* cpu-all */
4658 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4659 user_sum,
4660 system_sum,
4661 idle_sum);
4662
4663 if (l < 0) {
4664 perror("Error writing to cache");
4665 rv = 0;
4666 goto err;
4667
4668 }
4669 if (l >= buf_size) {
4670 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4671 rv = 0;
4672 goto err;
4673 }
4674
4675 buf += l;
4676 buf_size -= l;
4677 total_len += l;
4678
4679 /* Render visible CPUs */
77005a6c
JS
4680 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4681 if (!stat_node->usage[curcpu].online)
4682 continue;
4683
4684 i++;
4685
4686 if (max_cpus > 0 && i == max_cpus)
056adcef
JS
4687 break;
4688
4689 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
77005a6c 4690 i,
056adcef
JS
4691 stat_node->view[curcpu].user,
4692 stat_node->view[curcpu].system,
4693 stat_node->view[curcpu].idle);
4694
4695 if (l < 0) {
4696 perror("Error writing to cache");
4697 rv = 0;
4698 goto err;
4699
4700 }
4701 if (l >= buf_size) {
4702 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4703 rv = 0;
4704 goto err;
4705 }
4706
4707 buf += l;
4708 buf_size -= l;
4709 total_len += l;
4710 }
4711
4712 /* Pass the rest of /proc/stat, start with the last line read */
4713 l = snprintf(buf, buf_size, "%s", line);
4714
4715 if (l < 0) {
4716 perror("Error writing to cache");
4717 rv = 0;
4718 goto err;
4719
4720 }
4721 if (l >= buf_size) {
4722 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4723 rv = 0;
4724 goto err;
4725 }
4726
4727 buf += l;
4728 buf_size -= l;
4729 total_len += l;
4730
4731 /* Pass the rest of the host's /proc/stat */
4732 while (getline(&line, &linelen, f) != -1) {
4733 l = snprintf(buf, buf_size, "%s", line);
4734 if (l < 0) {
4735 perror("Error writing to cache");
4736 rv = 0;
4737 goto err;
4738 }
4739 if (l >= buf_size) {
4740 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4741 rv = 0;
4742 goto err;
4743 }
4744 buf += l;
4745 buf_size -= l;
4746 total_len += l;
4747 }
4748
4749 rv = total_len;
4750
4751err:
2f49b662
JS
4752 if (stat_node)
4753 pthread_mutex_unlock(&stat_node->lock);
056adcef
JS
4754 if (line)
4755 free(line);
4756 if (diff)
4757 free(diff);
4758 return rv;
4759}
4760
f34de69a 4761#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
4762static int proc_stat_read(char *buf, size_t size, off_t offset,
4763 struct fuse_file_info *fi)
4764{
4765 struct fuse_context *fc = fuse_get_context();
4766 struct file_info *d = (struct file_info *)fi->fh;
4767 char *cg;
4768 char *cpuset = NULL;
4769 char *line = NULL;
4770 size_t linelen = 0, total_len = 0, rv = 0;
4771 int curcpu = -1; /* cpu numbering starts at 0 */
77005a6c 4772 int physcpu = 0;
7144f069 4773 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 4774 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 4775 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
4776 char cpuall[CPUALL_MAX_SIZE];
4777 /* reserve for cpu all */
4778 char *cache = d->buf + CPUALL_MAX_SIZE;
4779 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4780 FILE *f = NULL;
8be92dd1 4781 struct cpuacct_usage *cg_cpu_usage = NULL;
79612c8b 4782 int cg_cpu_usage_size = 0;
237e200e
SH
4783
4784 if (offset){
4785 if (offset > d->size)
4786 return -EINVAL;
4787 if (!d->cached)
4788 return 0;
4789 int left = d->size - offset;
4790 total_len = left > size ? size: left;
4791 memcpy(buf, d->buf + offset, total_len);
4792 return total_len;
4793 }
4794
4795 pid_t initpid = lookup_initpid_in_store(fc->pid);
4796 if (initpid <= 0)
4797 initpid = fc->pid;
4798 cg = get_pid_cgroup(initpid, "cpuset");
4799 if (!cg)
4800 return read_file("/proc/stat", buf, size, d);
6d2f6996 4801 prune_init_slice(cg);
237e200e
SH
4802
4803 cpuset = get_cpuset(cg);
4804 if (!cpuset)
4805 goto err;
4806
8be92dd1
JS
4807 /*
4808 * Read cpuacct.usage_all for all CPUs.
4809 * If the cpuacct cgroup is present, it is used to calculate the container's
4810 * CPU usage. If not, values from the host's /proc/stat are used.
4811 */
79612c8b 4812 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
8be92dd1
JS
4813 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4814 "falling back to the host's /proc/stat");
4815 }
4816
237e200e
SH
4817 f = fopen("/proc/stat", "r");
4818 if (!f)
4819 goto err;
4820
4821 //skip first line
4822 if (getline(&line, &linelen, f) < 0) {
b8defc3d 4823 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
4824 goto err;
4825 }
4826
056adcef 4827 if (use_cpuview(cg) && cg_cpu_usage) {
79612c8b
JS
4828 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4829 f, d->buf, d->buflen);
056adcef
JS
4830 goto out;
4831 }
4832
237e200e 4833 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4834 ssize_t l;
237e200e
SH
4835 char cpu_char[10]; /* That's a lot of cores */
4836 char *c;
8be92dd1
JS
4837 uint64_t all_used, cg_used, new_idle;
4838 int ret;
237e200e 4839
b4665ce0
SH
4840 if (strlen(line) == 0)
4841 continue;
237e200e
SH
4842 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4843 /* not a ^cpuN line containing a number N, just print it */
9502bae2 4844 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
4845 if (l < 0) {
4846 perror("Error writing to cache");
4847 rv = 0;
4848 goto err;
4849 }
4850 if (l >= cache_size) {
b8defc3d 4851 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4852 rv = 0;
4853 goto err;
4854 }
4855 cache += l;
4856 cache_size -= l;
4857 total_len += l;
4858 continue;
4859 }
4860
77005a6c 4861 if (sscanf(cpu_char, "%d", &physcpu) != 1)
237e200e 4862 continue;
77005a6c 4863 if (!cpu_in_cpuset(physcpu, cpuset))
237e200e
SH
4864 continue;
4865 curcpu ++;
4866
8be92dd1 4867 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
7144f069
CB
4868 &user,
4869 &nice,
4870 &system,
4871 &idle,
4872 &iowait,
4873 &irq,
4874 &softirq,
4875 &steal,
4876 &guest,
8be92dd1
JS
4877 &guest_nice);
4878
4879 if (ret != 10 || !cg_cpu_usage) {
4880 c = strchr(line, ' ');
4881 if (!c)
4882 continue;
4883 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4884 if (l < 0) {
4885 perror("Error writing to cache");
4886 rv = 0;
4887 goto err;
4888
4889 }
4890 if (l >= cache_size) {
4891 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4892 rv = 0;
4893 goto err;
4894 }
4895
4896 cache += l;
4897 cache_size -= l;
4898 total_len += l;
4899
4900 if (ret != 10)
4901 continue;
4902 }
4903
4904 if (cg_cpu_usage) {
79612c8b
JS
4905 if (physcpu >= cg_cpu_usage_size)
4906 break;
4907
8be92dd1 4908 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
77005a6c 4909 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
8be92dd1
JS
4910
4911 if (all_used >= cg_used) {
4912 new_idle = idle + (all_used - cg_used);
4913
4914 } else {
4915 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4916 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4917 curcpu, cg, all_used, cg_used);
4918 new_idle = idle;
4919 }
4920
4921 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
77005a6c 4922 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
8be92dd1
JS
4923 new_idle);
4924
4925 if (l < 0) {
4926 perror("Error writing to cache");
4927 rv = 0;
4928 goto err;
4929
4930 }
4931 if (l >= cache_size) {
4932 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4933 rv = 0;
4934 goto err;
4935 }
4936
4937 cache += l;
4938 cache_size -= l;
4939 total_len += l;
4940
77005a6c
JS
4941 user_sum += cg_cpu_usage[physcpu].user;
4942 system_sum += cg_cpu_usage[physcpu].system;
8be92dd1
JS
4943 idle_sum += new_idle;
4944
4945 } else {
4946 user_sum += user;
4947 nice_sum += nice;
4948 system_sum += system;
4949 idle_sum += idle;
4950 iowait_sum += iowait;
4951 irq_sum += irq;
4952 softirq_sum += softirq;
4953 steal_sum += steal;
4954 guest_sum += guest;
4955 guest_nice_sum += guest_nice;
4956 }
237e200e
SH
4957 }
4958
4959 cache = d->buf;
4960
7144f069
CB
4961 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4962 user_sum,
4963 nice_sum,
4964 system_sum,
4965 idle_sum,
4966 iowait_sum,
4967 irq_sum,
4968 softirq_sum,
4969 steal_sum,
4970 guest_sum,
4971 guest_nice_sum);
4972 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
4973 memcpy(cache, cpuall, cpuall_len);
4974 cache += cpuall_len;
7144f069 4975 } else {
237e200e 4976 /* shouldn't happen */
b8defc3d 4977 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
4978 cpuall_len = 0;
4979 }
4980
4981 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4982 total_len += cpuall_len;
056adcef
JS
4983
4984out:
237e200e
SH
4985 d->cached = 1;
4986 d->size = total_len;
7144f069
CB
4987 if (total_len > size)
4988 total_len = size;
237e200e
SH
4989
4990 memcpy(buf, d->buf, total_len);
4991 rv = total_len;
4992
4993err:
4994 if (f)
4995 fclose(f);
8be92dd1
JS
4996 if (cg_cpu_usage)
4997 free(cg_cpu_usage);
237e200e
SH
4998 free(line);
4999 free(cpuset);
5000 free(cg);
5001 return rv;
5002}
5003
0ecddf02
CB
5004/* This function retrieves the busy time of a group of tasks by looking at
5005 * cpuacct.usage. Unfortunately, this only makes sense when the container has
5006 * been given its own cpuacct cgroup. If not, this function will take the busy
5007 * time of all other tasks that do not actually belong to the container into
5008 * account as well. If someone has a clever solution for this please send a
5009 * patch!
5010 */
237e200e
SH
5011static unsigned long get_reaper_busy(pid_t task)
5012{
5013 pid_t initpid = lookup_initpid_in_store(task);
5014 char *cgroup = NULL, *usage_str = NULL;
5015 unsigned long usage = 0;
5016
5017 if (initpid <= 0)
5018 return 0;
5019
5020 cgroup = get_pid_cgroup(initpid, "cpuacct");
5021 if (!cgroup)
5022 goto out;
6d2f6996 5023 prune_init_slice(cgroup);
237e200e
SH
5024 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
5025 goto out;
5026 usage = strtoul(usage_str, NULL, 10);
5027 usage /= 1000000000;
5028
5029out:
5030 free(cgroup);
5031 free(usage_str);
5032 return usage;
5033}
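/* For instance (made-up number): a cpuacct.usage value of "7500000000"
 * nanoseconds parses to 7500000000 and is divided by 1000000000, so the
 * reaper's cgroup is reported as 7 seconds of busy time.
 */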
5034
5035#if RELOADTEST
5036void iwashere(void)
5037{
237e200e
SH
5038 int fd;
5039
ec2b5e7c 5040 fd = creat("/tmp/lxcfs-iwashere", 0644);
237e200e
SH
5041 if (fd >= 0)
5042 close(fd);
5043}
5044#endif
5045
5046/*
5047 * For the first field we use the age of the reaper of the calling pid,
5048 * as returned by get_reaper_age(). For the second field we subtract the
5049 * busy time reported by get_reaper_busy() from that age.
5050 */
5051static int proc_uptime_read(char *buf, size_t size, off_t offset,
5052 struct fuse_file_info *fi)
5053{
5054 struct fuse_context *fc = fuse_get_context();
5055 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 5056 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 5057 char *cache = d->buf;
a262ddb7 5058 ssize_t total_len = 0;
0ecddf02 5059 uint64_t idletime, reaperage;
237e200e
SH
5060
5061#if RELOADTEST
5062 iwashere();
5063#endif
5064
5065 if (offset){
237e200e
SH
5066 if (!d->cached)
5067 return 0;
bbdf646b
BM
5068 if (offset > d->size)
5069 return -EINVAL;
237e200e
SH
5070 int left = d->size - offset;
5071 total_len = left > size ? size: left;
5072 memcpy(buf, cache + offset, total_len);
5073 return total_len;
5074 }
5075
0ecddf02
CB
5076 reaperage = get_reaper_age(fc->pid);
5077 /* To understand why this is done, please read the comment to the
5078 * get_reaper_busy() function.
5079 */
5080 idletime = reaperage;
5081 if (reaperage >= busytime)
5082 idletime = reaperage - busytime;
237e200e 5083
bbdf646b
BM
5084 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5085 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 5086 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
5087 return 0;
5088 }
5089
5090 d->size = (int)total_len;
5091 d->cached = 1;
5092
5093 if (total_len > size) total_len = size;
5094
5095 memcpy(buf, d->buf, total_len);
5096 return total_len;
5097}
5098
5099static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5100 struct fuse_file_info *fi)
5101{
5102 char dev_name[72];
5103 struct fuse_context *fc = fuse_get_context();
5104 struct file_info *d = (struct file_info *)fi->fh;
5105 char *cg;
5106 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5107 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5108 unsigned long read = 0, write = 0;
5109 unsigned long read_merged = 0, write_merged = 0;
5110 unsigned long read_sectors = 0, write_sectors = 0;
5111 unsigned long read_ticks = 0, write_ticks = 0;
5112 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5113 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5114 char *cache = d->buf;
5115 size_t cache_size = d->buflen;
5116 char *line = NULL;
5117 size_t linelen = 0, total_len = 0, rv = 0;
5118 unsigned int major = 0, minor = 0;
5119 int i = 0;
5120 FILE *f = NULL;
5121
5122 if (offset){
5123 if (offset > d->size)
5124 return -EINVAL;
5125 if (!d->cached)
5126 return 0;
5127 int left = d->size - offset;
5128 total_len = left > size ? size: left;
5129 memcpy(buf, cache + offset, total_len);
5130 return total_len;
5131 }
5132
5133 pid_t initpid = lookup_initpid_in_store(fc->pid);
5134 if (initpid <= 0)
5135 initpid = fc->pid;
5136 cg = get_pid_cgroup(initpid, "blkio");
5137 if (!cg)
5138 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 5139 prune_init_slice(cg);
237e200e 5140
2209fe50 5141 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 5142 goto err;
2209fe50 5143 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 5144 goto err;
2209fe50 5145 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 5146 goto err;
2209fe50 5147 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 5148 goto err;
2209fe50 5149 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
5150 goto err;
5151
5152
5153 f = fopen("/proc/diskstats", "r");
5154 if (!f)
5155 goto err;
5156
5157 while (getline(&line, &linelen, f) != -1) {
a262ddb7 5158 ssize_t l;
2209fe50 5159 char lbuf[256];
237e200e
SH
5160
5161 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 5162 if (i != 3)
237e200e 5163 continue;
2209fe50
SH
5164
5165 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5166 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5167 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5168 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5169 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5170 read_sectors = read_sectors/512;
5171 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5172 write_sectors = write_sectors/512;
5173
5174 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5175 rd_svctm = rd_svctm/1000000;
5176 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5177 rd_wait = rd_wait/1000000;
5178 read_ticks = rd_svctm + rd_wait;
5179
5180 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5181 wr_svctm = wr_svctm/1000000;
5182 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5183 wr_wait = wr_wait/1000000;
5184 write_ticks = wr_svctm + wr_wait;
5185
5186 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5187 tot_ticks = tot_ticks/1000000;
237e200e
SH
5188
5189 memset(lbuf, 0, 256);
2db31eb6
SH
5190 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5191 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5192 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5193 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5194 else
5195 continue;
237e200e 5196
2209fe50 5197 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
5198 if (l < 0) {
5199 perror("Error writing to fuse buf");
5200 rv = 0;
5201 goto err;
5202 }
5203 if (l >= cache_size) {
b8defc3d 5204 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
5205 rv = 0;
5206 goto err;
5207 }
5208 cache += l;
5209 cache_size -= l;
5210 total_len += l;
5211 }
5212
5213 d->cached = 1;
5214 d->size = total_len;
5215 if (total_len > size ) total_len = size;
5216 memcpy(buf, d->buf, total_len);
5217
5218 rv = total_len;
5219err:
5220 free(cg);
5221 if (f)
5222 fclose(f);
5223 free(line);
5224 free(io_serviced_str);
5225 free(io_merged_str);
5226 free(io_service_bytes_str);
5227 free(io_wait_time_str);
5228 free(io_service_time_str);
5229 return rv;
5230}
5231
70dcc12e
SH
5232static int proc_swaps_read(char *buf, size_t size, off_t offset,
5233 struct fuse_file_info *fi)
5234{
5235 struct fuse_context *fc = fuse_get_context();
5236 struct file_info *d = (struct file_info *)fi->fh;
5237 char *cg = NULL;
018246ff 5238 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 5239 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
5240 ssize_t total_len = 0, rv = 0;
5241 ssize_t l = 0;
70dcc12e
SH
5242 char *cache = d->buf;
5243
5244 if (offset) {
5245 if (offset > d->size)
5246 return -EINVAL;
5247 if (!d->cached)
5248 return 0;
5249 int left = d->size - offset;
5250 total_len = left > size ? size: left;
5251 memcpy(buf, cache + offset, total_len);
5252 return total_len;
5253 }
5254
5255 pid_t initpid = lookup_initpid_in_store(fc->pid);
5256 if (initpid <= 0)
5257 initpid = fc->pid;
5258 cg = get_pid_cgroup(initpid, "memory");
5259 if (!cg)
5260 return read_file("/proc/swaps", buf, size, d);
6d2f6996 5261 prune_init_slice(cg);
70dcc12e 5262
018246ff 5263 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
5264
5265 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5266 goto err;
5267
70dcc12e
SH
5268 memusage = strtoul(memusage_str, NULL, 10);
5269
5270 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5271 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5272
018246ff 5273 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
5274 memswusage = strtoul(memswusage_str, NULL, 10);
5275
70dcc12e
SH
5276 swap_total = (memswlimit - memlimit) / 1024;
5277 swap_free = (memswusage - memusage) / 1024;
5278 }
5279
5280 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5281
5282 /* When no mem + swap limit is specified or swapaccount=0*/
5283 if (!memswlimit) {
5284 char *line = NULL;
5285 size_t linelen = 0;
5286 FILE *f = fopen("/proc/meminfo", "r");
5287
5288 if (!f)
5289 goto err;
5290
5291 while (getline(&line, &linelen, f) != -1) {
5292 if (startswith(line, "SwapTotal:")) {
5293 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5294 } else if (startswith(line, "SwapFree:")) {
5295 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5296 }
5297 }
5298
5299 free(line);
5300 fclose(f);
5301 }
5302
5303 if (swap_total > 0) {
a262ddb7
CB
5304 l = snprintf(d->buf + total_len, d->size - total_len,
5305 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5306 swap_total, swap_free);
5307 total_len += l;
70dcc12e
SH
5308 }
5309
a262ddb7 5310 if (total_len < 0 || l < 0) {
70dcc12e
SH
5311 perror("Error writing to cache");
5312 rv = 0;
5313 goto err;
5314 }
5315
5316 d->cached = 1;
5317 d->size = (int)total_len;
5318
5319 if (total_len > size) total_len = size;
5320 memcpy(buf, d->buf, total_len);
5321 rv = total_len;
5322
5323err:
5324 free(cg);
5325 free(memswlimit_str);
5326 free(memlimit_str);
5327 free(memusage_str);
5328 free(memswusage_str);
70dcc12e
SH
5329 return rv;
5330}
6db4f7a3 5331/*
5332 * Find the process pids for a cgroup path.
5333 * e.g. read /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs (and its children) to collect the pids.
5334 * @pid_buf : array that receives the pids.
5335 * @dpath : the path of the cgroup, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
5336 * @depth : how many levels of child cgroups to descend into.
5337 * @sum : the number of pids collected so far; the updated total is returned.
5338 * @cfd : the file descriptor of the mounted cgroup hierarchy, e.g. /sys/fs/cgroup/cpu
5339 */
5340static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5341{
5342 DIR *dir;
5343 int fd;
5344 struct dirent *file;
5345 FILE *f = NULL;
5346 size_t linelen = 0;
5347 char *line = NULL;
5348 int pd;
5349 char *path_dir, *path;
5350 char **pid;
5351
5352 /* path = dpath + "/cgroup.procs" + /0 */
5353 do {
5354 path = malloc(strlen(dpath) + 20);
5355 } while (!path);
5356
5357 strcpy(path, dpath);
5358 fd = openat(cfd, path, O_RDONLY);
5359 if (fd < 0)
5360 goto out;
5361
5362 dir = fdopendir(fd);
5363 if (dir == NULL) {
5364 close(fd);
5365 goto out;
5366 }
5367
5368 while (((file = readdir(dir)) != NULL) && depth > 0) {
5369 if (strncmp(file->d_name, ".", 1) == 0)
5370 continue;
5371 if (strncmp(file->d_name, "..", 1) == 0)
5372 continue;
5373 if (file->d_type == DT_DIR) {
5374 /* path + '/' + d_name +/0 */
5375 do {
5376 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5377 } while (!path_dir);
5378 strcpy(path_dir, path);
5379 strcat(path_dir, "/");
5380 strcat(path_dir, file->d_name);
5381 pd = depth - 1;
5382 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5383 free(path_dir);
5384 }
5385 }
5386 closedir(dir);
5387
5388 strcat(path, "/cgroup.procs");
5389 fd = openat(cfd, path, O_RDONLY);
5390 if (fd < 0)
5391 goto out;
5392
5393 f = fdopen(fd, "r");
5394 if (!f) {
5395 close(fd);
5396 goto out;
5397 }
5398
5399 while (getline(&line, &linelen, f) != -1) {
5400 do {
5401 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5402 } while (!pid);
5403 *pid_buf = pid;
5404 do {
5405 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5406 } while (*(*pid_buf + sum) == NULL);
5407 strcpy(*(*pid_buf + sum), line);
5408 sum++;
5409 }
5410 fclose(f);
5411out:
832904c1
JS
5412 if (line)
5413 free(line);
6db4f7a3 5414 free(path);
5415 return sum;
5416}
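/* A hypothetical call (path made up): starting from the container's cgroup,
 *
 *	sum = calc_pid(&pid_buf, "./docker/containerid", DEPTH_DIR, 0, cfd);
 *
 * descends up to DEPTH_DIR levels of child cgroups below ./docker/containerid,
 * reads every cgroup.procs file it finds and returns how many pids were
 * stored in pid_buf.
 */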
5417/*
5418 * calc_load calculates the load according to the following formula:
5419 * load1 = load0 * exp + active * (1 - exp)
5420 *
5421 * @load1: the new loadavg.
5422 * @load0: the former loadavg.
5423 * @active: the number of runnable pids at this moment.
5424 * @exp: the fixed-point decay constant (EXP_1/EXP_5/EXP_15) defined at the top of this file.
5425 */
5426static unsigned long
5427calc_load(unsigned long load, unsigned long exp, unsigned long active)
5428{
5429 unsigned long newload;
5430
5431 active = active > 0 ? active * FIXED_1 : 0;
5432 newload = load * exp + active * (FIXED_1 - exp);
5433 if (active >= load)
5434 newload += FIXED_1 - 1;
5435
5436 return newload / FIXED_1;
5437}
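/* A worked example with made-up numbers, using FIXED_1 = 2048 and EXP_1 = 1884
 * from the defines above: a previous 1-minute load of 1.00 (load = 2048) and
 * active = 3 runnable tasks (3 * 2048 = 6144) give
 *	newload = 2048 * 1884 + 6144 * (2048 - 1884) = 4866048,
 * plus FIXED_1 - 1 for rounding since active >= load, so calc_load() returns
 * 4868095 / 2048 = 2376, which corresponds to a displayed load of about 1.16.
 */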
5438
5439/*
5440 * A return value of 0 means that the container p->cg has gone away.
5441 * A return value of -1 means that an error occurred during the refresh.
5442 * A positive number is the total number of pids found.
5443 */
5444static int refresh_load(struct load_node *p, char *path)
5445{
5446 FILE *f = NULL;
5447 char **idbuf;
5448 char proc_path[256];
5449 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5450 char *line = NULL;
5451 size_t linelen = 0;
5452 int sum, length;
5453 DIR *dp;
5454 struct dirent *file;
5455
5456 do {
5457 idbuf = malloc(sizeof(char *));
5458 } while (!idbuf);
5459 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5460 /* normal exit */
5461 if (sum == 0)
5462 goto out;
5463
5464 for (i = 0; i < sum; i++) {
5465 /* strip the trailing '\n' */
5466 length = strlen(idbuf[i])-1;
5467 idbuf[i][length] = '\0';
5468 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5469 if (ret < 0 || ret > 255) {
5470 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5471 i = sum;
5472 sum = -1;
5473 goto err_out;
5474 }
5475
5476 dp = opendir(proc_path);
5477 if (!dp) {
5478 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5479 continue;
5480 }
5481 while ((file = readdir(dp)) != NULL) {
5482 if (strncmp(file->d_name, ".", 1) == 0)
5483 continue;
5484 if (strncmp(file->d_name, "..", 1) == 0)
5485 continue;
5486 total_pid++;
5487 /* Track the largest pid seen as last_pid. */
5488 ret = atoi(file->d_name);
5489 last_pid = (ret > last_pid) ? ret : last_pid;
5490
5491 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5492 if (ret < 0 || ret > 255) {
5493 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5494 i = sum;
5495 sum = -1;
5496 closedir(dp);
5497 goto err_out;
5498 }
5499 f = fopen(proc_path, "r");
5500 if (f != NULL) {
5501 while (getline(&line, &linelen, f) != -1) {
5502 /* Find State */
5503 if ((line[0] == 'S') && (line[1] == 't'))
5504 break;
5505 }
5506 if ((line[7] == 'R') || (line[7] == 'D'))
5507 run_pid++;
5508 fclose(f);
5509 }
5510 }
5511 closedir(dp);
5512 }
5513 /*Calculate the loadavg.*/
5514 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5515 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5516 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5517 p->run_pid = run_pid;
5518 p->total_pid = total_pid;
5519 p->last_pid = last_pid;
5520
5521 free(line);
beb5024e 5522err_out:
6db4f7a3 5523 for (; i > 0; i--)
5524 free(idbuf[i-1]);
5525out:
5526 free(idbuf);
5527 return sum;
5528}
5529/*
5530 * Traverse the hash table and update it.
5531 */
5532void *load_begin(void *arg)
5533{
5534
5535 char *path = NULL;
5536 int i, sum, length, ret;
5537 struct load_node *f;
5538 int first_node;
5539 clock_t time1, time2;
5540
5541 while (1) {
a83618e2
JS
5542 if (loadavg_stop == 1)
5543 return NULL;
5544
6db4f7a3 5545 time1 = clock();
5546 for (i = 0; i < LOAD_SIZE; i++) {
5547 pthread_mutex_lock(&load_hash[i].lock);
5548 if (load_hash[i].next == NULL) {
5549 pthread_mutex_unlock(&load_hash[i].lock);
5550 continue;
5551 }
5552 f = load_hash[i].next;
5553 first_node = 1;
5554 while (f) {
5555 length = strlen(f->cg) + 2;
5556 do {
5557 /* strlen(f->cg) + '.' or '' + \0 */
5558 path = malloc(length);
5559 } while (!path);
5560
5561 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5562 if (ret < 0 || ret > length - 1) {
5563 /* snprintf failed, ignore the node.*/
5564 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5565 goto out;
5566 }
5567 sum = refresh_load(f, path);
5568 if (sum == 0) {
5569 f = del_node(f, i);
5570 } else {
5571out: f = f->next;
5572 }
5573 free(path);
5574 /* load_hash[i].lock is only held while handling the first node. */
5575 if (first_node == 1) {
5576 first_node = 0;
5577 pthread_mutex_unlock(&load_hash[i].lock);
5578 }
5579 }
5580 }
a83618e2
JS
5581
5582 if (loadavg_stop == 1)
5583 return NULL;
5584
6db4f7a3 5585 time2 = clock();
5586 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5587 }
5588}
5589
5590static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5591 struct fuse_file_info *fi)
5592{
5593 struct fuse_context *fc = fuse_get_context();
5594 struct file_info *d = (struct file_info *)fi->fh;
5595 pid_t initpid;
5596 char *cg;
5597 ssize_t total_len = 0;
5598 char *cache = d->buf;
5599 struct load_node *n;
5600 int hash;
01d88ede 5601 int cfd, rv = 0;
6db4f7a3 5602 unsigned long a, b, c;
5603
5604 if (offset) {
5605 if (offset > d->size)
5606 return -EINVAL;
5607 if (!d->cached)
5608 return 0;
5609 int left = d->size - offset;
5610 total_len = left > size ? size : left;
5611 memcpy(buf, cache + offset, total_len);
5612 return total_len;
5613 }
5614 if (!loadavg)
5615 return read_file("/proc/loadavg", buf, size, d);
5616
5617 initpid = lookup_initpid_in_store(fc->pid);
5618 if (initpid <= 0)
5619 initpid = fc->pid;
5620 cg = get_pid_cgroup(initpid, "cpu");
5621 if (!cg)
5622 return read_file("/proc/loadavg", buf, size, d);
5623
5624 prune_init_slice(cg);
b077527b 5625 hash = calc_hash(cg) % LOAD_SIZE;
6db4f7a3 5626 n = locate_node(cg, hash);
5627
5628 /* First time */
5629 if (n == NULL) {
5630 if (!find_mounted_controller("cpu", &cfd)) {
5631 /*
5632 * locate_node() above does not call pthread_rwlock_unlock() itself,
5633 * because a node must not be deleted before this read has ended.
5634 */
5635 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
5636 rv = 0;
5637 goto err;
6db4f7a3 5638 }
5639 do {
5640 n = malloc(sizeof(struct load_node));
5641 } while (!n);
5642
5643 do {
5644 n->cg = malloc(strlen(cg)+1);
5645 } while (!n->cg);
5646 strcpy(n->cg, cg);
5647 n->avenrun[0] = 0;
5648 n->avenrun[1] = 0;
5649 n->avenrun[2] = 0;
5650 n->run_pid = 0;
5651 n->total_pid = 1;
5652 n->last_pid = initpid;
5653 n->cfd = cfd;
5654 insert_node(&n, hash);
5655 }
5656 a = n->avenrun[0] + (FIXED_1/200);
5657 b = n->avenrun[1] + (FIXED_1/200);
5658 c = n->avenrun[2] + (FIXED_1/200);
5659 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5660 LOAD_INT(a), LOAD_FRAC(a),
5661 LOAD_INT(b), LOAD_FRAC(b),
5662 LOAD_INT(c), LOAD_FRAC(c),
5663 n->run_pid, n->total_pid, n->last_pid);
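	/* For instance (made-up value): if n->avenrun[0] is 2458 in fixed-point,
	 * then a = 2458 + 2048/200 = 2468, LOAD_INT(a) = 2468 >> 11 = 1 and
	 * LOAD_FRAC(a) = ((2468 & 2047) * 100) >> 11 = 20, so the first field
	 * is rendered as "1.20". */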
5664 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5665 if (total_len < 0 || total_len >= d->buflen) {
5666 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
5667 rv = 0;
5668 goto err;
6db4f7a3 5669 }
5670 d->size = (int)total_len;
5671 d->cached = 1;
5672
5673 if (total_len > size)
5674 total_len = size;
5675 memcpy(buf, d->buf, total_len);
01d88ede
JS
5676 rv = total_len;
5677
5678err:
5679 free(cg);
5680 return rv;
6db4f7a3 5681}
5682/* Return a positive number on success, return 0 on failure.*/
5683pthread_t load_daemon(int load_use)
5684{
5685 int ret;
5686 pthread_t pid;
5687
5688 ret = init_load();
5689 if (ret == -1) {
5690 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5691 return 0;
5692 }
5693 ret = pthread_create(&pid, NULL, load_begin, NULL);
5694 if (ret != 0) {
5695 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5696 load_free();
5697 return 0;
5698 }
5699 /* enable use of the loadavg calculation; load_use is expected to be 1 */
5700 loadavg = load_use;
5701 return pid;
5702}
70dcc12e 5703
a83618e2
JS
5704/* Returns 0 on success. */
5705int stop_load_daemon(pthread_t pid)
5706{
5707 int s;
5708
5709 /* Signal the thread to gracefully stop */
5710 loadavg_stop = 1;
5711
5712 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5713 if (s != 0) {
5714 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5715 return -1;
5716 }
5717
5718 load_free();
5719 loadavg_stop = 0;
5720
5721 return 0;
5722}
5723
237e200e
SH
5724static off_t get_procfile_size(const char *which)
5725{
5726 FILE *f = fopen(which, "r");
5727 char *line = NULL;
5728 size_t len = 0;
5729 ssize_t sz, answer = 0;
5730 if (!f)
5731 return 0;
5732
5733 while ((sz = getline(&line, &len, f)) != -1)
5734 answer += sz;
5735 fclose (f);
5736 free(line);
5737
5738 return answer;
5739}
5740
5741int proc_getattr(const char *path, struct stat *sb)
5742{
5743 struct timespec now;
5744
5745 memset(sb, 0, sizeof(struct stat));
5746 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5747 return -EINVAL;
5748 sb->st_uid = sb->st_gid = 0;
5749 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5750 if (strcmp(path, "/proc") == 0) {
5751 sb->st_mode = S_IFDIR | 00555;
5752 sb->st_nlink = 2;
5753 return 0;
5754 }
5755 if (strcmp(path, "/proc/meminfo") == 0 ||
5756 strcmp(path, "/proc/cpuinfo") == 0 ||
5757 strcmp(path, "/proc/uptime") == 0 ||
5758 strcmp(path, "/proc/stat") == 0 ||
70dcc12e 5759 strcmp(path, "/proc/diskstats") == 0 ||
46be8eed 5760 strcmp(path, "/proc/swaps") == 0 ||
5761 strcmp(path, "/proc/loadavg") == 0) {
237e200e
SH
5762 sb->st_size = 0;
5763 sb->st_mode = S_IFREG | 00444;
5764 sb->st_nlink = 1;
5765 return 0;
5766 }
5767
5768 return -ENOENT;
5769}
5770
5771int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5772 struct fuse_file_info *fi)
5773{
d639f863
CB
5774 if (filler(buf, ".", NULL, 0) != 0 ||
5775 filler(buf, "..", NULL, 0) != 0 ||
5776 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5777 filler(buf, "meminfo", NULL, 0) != 0 ||
5778 filler(buf, "stat", NULL, 0) != 0 ||
5779 filler(buf, "uptime", NULL, 0) != 0 ||
5780 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5781 filler(buf, "swaps", NULL, 0) != 0 ||
5782 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5783 return -EINVAL;
5784 return 0;
5785}
5786
5787int proc_open(const char *path, struct fuse_file_info *fi)
5788{
5789 int type = -1;
5790 struct file_info *info;
5791
5792 if (strcmp(path, "/proc/meminfo") == 0)
5793 type = LXC_TYPE_PROC_MEMINFO;
5794 else if (strcmp(path, "/proc/cpuinfo") == 0)
5795 type = LXC_TYPE_PROC_CPUINFO;
5796 else if (strcmp(path, "/proc/uptime") == 0)
5797 type = LXC_TYPE_PROC_UPTIME;
5798 else if (strcmp(path, "/proc/stat") == 0)
5799 type = LXC_TYPE_PROC_STAT;
5800 else if (strcmp(path, "/proc/diskstats") == 0)
5801 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
5802 else if (strcmp(path, "/proc/swaps") == 0)
5803 type = LXC_TYPE_PROC_SWAPS;
46be8eed 5804 else if (strcmp(path, "/proc/loadavg") == 0)
5805 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
5806 if (type == -1)
5807 return -ENOENT;
5808
5809 info = malloc(sizeof(*info));
5810 if (!info)
5811 return -ENOMEM;
5812
5813 memset(info, 0, sizeof(*info));
5814 info->type = type;
5815
5816 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5817 do {
5818 info->buf = malloc(info->buflen);
5819 } while (!info->buf);
5820 memset(info->buf, 0, info->buflen);
5821 /* set actual size to buffer size */
5822 info->size = info->buflen;
5823
5824 fi->fh = (unsigned long)info;
5825 return 0;
5826}
5827
bddbb106
SH
5828int proc_access(const char *path, int mask)
5829{
e7849aa3
CB
5830 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5831 return 0;
5832
bddbb106
SH
5833 /* these are all read-only */
5834 if ((mask & ~R_OK) != 0)
1b060d0a 5835 return -EACCES;
bddbb106
SH
5836 return 0;
5837}
5838
237e200e
SH
5839int proc_release(const char *path, struct fuse_file_info *fi)
5840{
43215927 5841 do_release_file_info(fi);
237e200e
SH
5842 return 0;
5843}
5844
5845int proc_read(const char *path, char *buf, size_t size, off_t offset,
5846 struct fuse_file_info *fi)
5847{
5848 struct file_info *f = (struct file_info *) fi->fh;
5849
5850 switch (f->type) {
5851 case LXC_TYPE_PROC_MEMINFO:
5852 return proc_meminfo_read(buf, size, offset, fi);
5853 case LXC_TYPE_PROC_CPUINFO:
5854 return proc_cpuinfo_read(buf, size, offset, fi);
5855 case LXC_TYPE_PROC_UPTIME:
5856 return proc_uptime_read(buf, size, offset, fi);
5857 case LXC_TYPE_PROC_STAT:
5858 return proc_stat_read(buf, size, offset, fi);
5859 case LXC_TYPE_PROC_DISKSTATS:
5860 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
5861 case LXC_TYPE_PROC_SWAPS:
5862 return proc_swaps_read(buf, size, offset, fi);
46be8eed 5863 case LXC_TYPE_PROC_LOADAVG:
5864 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
5865 default:
5866 return -EINVAL;
5867 }
5868}
5869
29a73c2f
CB
5870/*
5871 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
5872 */
5873
5874static bool mkdir_p(const char *dir, mode_t mode)
5875{
5876 const char *tmp = dir;
5877 const char *orig = dir;
5878 char *makeme;
5879
5880 do {
5881 dir = tmp + strspn(tmp, "/");
5882 tmp = dir + strcspn(dir, "/");
5883 makeme = strndup(orig, dir - orig);
5884 if (!makeme)
5885 return false;
5886 if (mkdir(makeme, mode) && errno != EEXIST) {
b8defc3d 5887 lxcfs_error("Failed to create directory '%s': %s.\n",
29a73c2f
CB
5888 makeme, strerror(errno));
5889 free(makeme);
5890 return false;
5891 }
5892 free(makeme);
5893 } while(tmp != dir);
5894
5895 return true;
5896}
5897
5898static bool umount_if_mounted(void)
5899{
5900 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5901 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5902 return false;
5903 }
5904 return true;
5905}
5906
2283e240
CB
5907/* __typeof__ should be safe to use with all compilers. */
5908typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5909static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5910{
5911 return (fs->f_type == (fs_type_magic)magic_val);
5912}
5913
0a4dea41
CB
5914/*
5915 * looking at fs/proc_namespace.c, it appears we can
5916 * actually expect the rootfs entry to very specifically contain
5917 * " - rootfs rootfs "
5918 * IIUC, so long as we've chrooted so that rootfs is not our root,
5919 * the rootfs entry should always be skipped in mountinfo contents.
5920 */
5921static bool is_on_ramfs(void)
5922{
5923 FILE *f;
5924 char *p, *p2;
5925 char *line = NULL;
5926 size_t len = 0;
5927 int i;
5928
5929 f = fopen("/proc/self/mountinfo", "r");
5930 if (!f)
5931 return false;
5932
5933 while (getline(&line, &len, f) != -1) {
5934 for (p = line, i = 0; p && i < 4; i++)
5935 p = strchr(p + 1, ' ');
5936 if (!p)
5937 continue;
5938 p2 = strchr(p + 1, ' ');
5939 if (!p2)
5940 continue;
5941 *p2 = '\0';
5942 if (strcmp(p + 1, "/") == 0) {
5943 // this is '/'. is it the ramfs?
5944 p = strchr(p2 + 1, '-');
5945 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5946 free(line);
5947 fclose(f);
5948 return true;
5949 }
5950 }
5951 }
5952 free(line);
5953 fclose(f);
5954 return false;
5955}
5956
cc309f33 5957static int pivot_enter()
0a4dea41 5958{
cc309f33
CB
5959 int ret = -1, oldroot = -1, newroot = -1;
5960
5961 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5962 if (oldroot < 0) {
5963 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5964 return ret;
5965 }
5966
5967 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5968 if (newroot < 0) {
5969 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5970 goto err;
5971 }
5972
5973 /* change into new root fs */
5974 if (fchdir(newroot) < 0) {
5975 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5976 goto err;
5977 }
5978
0a4dea41
CB
5979 /* pivot_root into our new root fs */
5980 if (pivot_root(".", ".") < 0) {
5981 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5982 goto err;
0a4dea41
CB
5983 }
5984
5985 /*
5986 * At this point the old-root is mounted on top of our new-root.
5987 * To unmount it we must not be chdir'd into it, so escape back
5988 * to the old-root.
5989 */
5990 if (fchdir(oldroot) < 0) {
5991 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5992 goto err;
0a4dea41
CB
5993 }
5994
5995 if (umount2(".", MNT_DETACH) < 0) {
5996 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5997 goto err;
0a4dea41
CB
5998 }
5999
6000 if (fchdir(newroot) < 0) {
6001 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 6002 goto err;
0a4dea41
CB
6003 }
6004
cc309f33
CB
6005 ret = 0;
6006
6007err:
6008 if (oldroot > 0)
6009 close(oldroot);
6010 if (newroot > 0)
6011 close(newroot);
6012
6013 return ret;
0a4dea41
CB
6014}
6015
6016static int chroot_enter()
6017{
6018 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6019 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6020 return -1;
6021 }
6022
6023 if (chroot(".") < 0) {
6024 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6025 return -1;
6026 }
6027
6028 if (chdir("/") < 0) {
6029 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6030 return -1;
6031 }
6032
6033 return 0;
6034}
6035
0232cbac 6036static int permute_and_enter(void)
29a73c2f 6037{
0a4dea41
CB
6038 struct statfs sb;
6039
6040 if (statfs("/", &sb) < 0) {
6041 lxcfs_error("%s\n", "Could not stat / mountpoint.");
cc309f33 6042 return -1;
0a4dea41
CB
6043 }
6044
6045 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6046 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6047 * /proc/1/mountinfo. */
6048 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6049 return chroot_enter();
29a73c2f 6050
cc309f33 6051 if (pivot_enter() < 0) {
0a4dea41 6052 lxcfs_error("%s\n", "Could not perform pivot root.");
cc309f33 6053 return -1;
29a73c2f
CB
6054 }
6055
cc309f33 6056 return 0;
29a73c2f
CB
6057}
6058
6059/* Prepare our new clean root. */
0232cbac 6060static int permute_prepare(void)
29a73c2f
CB
6061{
6062 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 6063 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
6064 return -1;
6065 }
6066
6067 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 6068 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
6069 return -1;
6070 }
6071
6072 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 6073 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
6074 return -1;
6075 }
6076
6077 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 6078 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
6079 return -1;
6080 }
6081
6082 return 0;
6083}
6084
0232cbac
CB
6085/* Calls chroot() on ramfs, pivot_root() in all other cases. */
6086static bool permute_root(void)
29a73c2f
CB
6087{
6088 /* Prepare new root. */
0232cbac 6089 if (permute_prepare() < 0)
29a73c2f
CB
6090 return false;
6091
6092 /* Pivot into new root. */
0232cbac 6093 if (permute_and_enter() < 0)
29a73c2f
CB
6094 return false;
6095
6096 return true;
6097}
6098
a257a8ee
CB
6099static int preserve_mnt_ns(int pid)
6100{
6101 int ret;
6102 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6103 char path[len];
6104
6105 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6106 if (ret < 0 || (size_t)ret >= len)
6107 return -1;
6108
6109 return open(path, O_RDONLY | O_CLOEXEC);
6110}
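
The descriptor returned here is what later lets the process hop back with setns() after its private mount work is done; lxcfs additionally keeps cgroup_mount_ns_fd open so the unshared namespace, and the mounts in it, stay alive across that switch. A self-contained sketch of the save/unshare/restore pattern; with_private_mounts() is hypothetical, not lxcfs code.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/* Run work() inside a freshly unshared mount namespace, then switch back to
 * the namespace we started in. Requires CAP_SYS_ADMIN. */
static int with_private_mounts(int (*work)(void))
{
	char path[64];
	int saved, ret = -1;

	snprintf(path, sizeof(path), "/proc/%d/ns/mnt", getpid());
	saved = open(path, O_RDONLY | O_CLOEXEC);
	if (saved < 0)
		return -1;

	if (unshare(CLONE_NEWNS) == 0 && work() == 0)
		ret = 0;

	/* Rejoin the original mount namespace whatever happened above. */
	if (setns(saved, CLONE_NEWNS) < 0)
		ret = -1;

	close(saved);
	return ret;
}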
6111
0a4dea41 6112static bool cgfs_prepare_mounts(void)
29a73c2f
CB
6113{
6114 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 6115 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
6116 return false;
6117 }
480262c9 6118
29a73c2f 6119 if (!umount_if_mounted()) {
b8defc3d 6120 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
6121 return false;
6122 }
6123
6124 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 6125 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
6126 return false;
6127 }
6128
a257a8ee
CB
6129 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6130 if (cgroup_mount_ns_fd < 0) {
6131 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6132 return false;
6133 }
6134
480262c9 6135 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 6136 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
6137 return false;
6138 }
480262c9 6139
29a73c2f 6140 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 6141 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
6142 return false;
6143 }
480262c9 6144
29a73c2f
CB
6145 return true;
6146}
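
One detail worth calling out: remounting "/" with MS_REC | MS_PRIVATE after the unshare keeps the tmpfs and cgroup mounts created below BASEDIR from propagating back into the host's mount namespace when "/" is a shared mount, which is the default on systemd hosts. A compressed sketch of that ordering; private_scratch_mount() is a hypothetical helper with error reporting omitted.

#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>

/* The ordering matters: unshare the mount namespace, cut propagation back to
 * the parent namespace, and only then create mounts meant to stay private. */
static int private_scratch_mount(const char *dir)
{
	if (unshare(CLONE_NEWNS) < 0)
		return -1;

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
		return -1;

	return mount("tmpfs", dir, "tmpfs", 0, "size=100000,mode=700");
}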
6147
0a4dea41 6148static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
6149{
6150 char *target;
6151 size_t clen, len;
6152 int i, ret;
6153
6154 for (i = 0; i < num_hierarchies; i++) {
6155 char *controller = hierarchies[i];
51c7ca35 6156
29a73c2f
CB
6157 clen = strlen(controller);
6158 len = strlen(BASEDIR) + clen + 2;
6159 target = malloc(len);
6160 if (!target)
6161 return false;
51c7ca35 6162
29a73c2f
CB
6163 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6164 if (ret < 0 || ret >= len) {
6165 free(target);
6166 return false;
6167 }
6168 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6169 free(target);
6170 return false;
6171 }
51c7ca35
CB
6172 if (!strcmp(controller, "unified"))
6173 ret = mount("none", target, "cgroup2", 0, NULL);
6174 else
6175 ret = mount(controller, target, "cgroup", 0, controller);
6176 if (ret < 0) {
6177 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
6178 free(target);
6179 return false;
6180 }
6181
6182 fd_hierarchies[i] = open(target, O_DIRECTORY);
6183 if (fd_hierarchies[i] < 0) {
6184 free(target);
6185 return false;
6186 }
6187 free(target);
6188 }
6189 return true;
6190}
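
The per-hierarchy O_DIRECTORY descriptors stored in fd_hierarchies presumably allow the rest of the code to resolve cgroup paths relative to these private mounts, for example via openat(), independent of how or whether the controllers are mounted in the caller's namespace. A hypothetical illustration of that pattern; open_cgroup_file() is not part of bindings.c.

#include <fcntl.h>
#include <stdio.h>

/* Open a cgroup file relative to one of the saved hierarchy descriptors
 * instead of an absolute path in the current mount namespace. */
static int open_cgroup_file(int hierarchy_fd, const char *cgroup, const char *file)
{
	char rel[4096];
	int ret;

	ret = snprintf(rel, sizeof(rel), "%s/%s", cgroup, file);
	if (ret < 0 || (size_t)ret >= sizeof(rel))
		return -1;

	/* e.g. openat(fd_hierarchies[i], "user.slice/cpu.shares", ...) */
	return openat(hierarchy_fd, rel, O_RDONLY | O_CLOEXEC);
}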
6191
480262c9 6192static bool cgfs_setup_controllers(void)
29a73c2f 6193{
0a4dea41 6194 if (!cgfs_prepare_mounts())
29a73c2f 6195 return false;
29a73c2f 6196
0a4dea41 6197 if (!cgfs_mount_hierarchies()) {
b8defc3d 6198 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
29a73c2f
CB
6199 return false;
6200 }
6201
0232cbac 6202 if (!permute_root())
29a73c2f
CB
6203 return false;
6204
6205 return true;
6206}
6207
6208static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
6209{
6210 FILE *f;
e58dab00
CB
6211 char *cret, *line = NULL;
6212 char cwd[MAXPATHLEN];
237e200e 6213 size_t len = 0;
480262c9 6214 int i, init_ns = -1;
51c7ca35 6215 bool found_unified = false;
237e200e
SH
6216
6217 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 6218 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
6219 return;
6220 }
e58dab00 6221
237e200e 6222 while (getline(&line, &len, f) != -1) {
51c7ca35 6223 char *idx, *p, *p2;
237e200e
SH
6224
6225 p = strchr(line, ':');
6226 if (!p)
6227 goto out;
51c7ca35 6228 idx = line;
237e200e
SH
6229 *(p++) = '\0';
6230
6231 p2 = strrchr(p, ':');
6232 if (!p2)
6233 goto out;
6234 *p2 = '\0';
6235
a67719f6
CB
6236 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6237 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6238 * because it parses out the empty string "" and later on passes
6239 * it to mount(). Let's skip such entries.
6240 */
51c7ca35
CB
6241 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6242 found_unified = true;
6243 p = "unified";
6244 }
a67719f6 6245
237e200e
SH
6246 if (!store_hierarchy(line, p))
6247 goto out;
6248 }
6249
480262c9 6250 /* Preserve initial namespace. */
a257a8ee 6251 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
6252 if (init_ns < 0) {
6253 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 6254 goto out;
b8defc3d 6255 }
480262c9 6256
92c3ee11 6257 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
6258 if (!fd_hierarchies) {
6259 lxcfs_error("%s\n", strerror(errno));
29a73c2f 6260 goto out;
b8defc3d 6261 }
29a73c2f 6262
480262c9
CB
6263 for (i = 0; i < num_hierarchies; i++)
6264 fd_hierarchies[i] = -1;
6265
e58dab00
CB
6266 cret = getcwd(cwd, MAXPATHLEN);
6267 if (!cret)
6268 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6269
480262c9
CB
 6270 /* This function calls unshare(CLONE_NEWNS), leaving our initial mount
 6271 * namespace, so that lxcfs can mount its cgroup hierarchies privately. */
b8defc3d
CB
6272 if (!cgfs_setup_controllers()) {
6273 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 6274 goto out;
b8defc3d 6275 }
480262c9 6276
b8defc3d
CB
6277 if (setns(init_ns, 0) < 0) {
6278 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 6279 goto out;
b8defc3d 6280 }
29a73c2f 6281
e58dab00
CB
6282 if (!cret || chdir(cwd) < 0)
6283 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6284
056adcef
JS
6285 if (!init_cpuview()) {
6286 lxcfs_error("%s\n", "failed to init CPU view");
6287 goto out;
6288 }
6289
237e200e
SH
6290 print_subsystems();
6291
6292out:
6293 free(line);
6294 fclose(f);
480262c9
CB
6295 if (init_ns >= 0)
6296 close(init_ns);
237e200e
SH
6297}
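
To make the parsing loop in the constructor concrete, here is how it splits two representative /proc/self/cgroup lines. This is a sketch, not lxcfs code: the controller names are examples only, and the NULL checks and error handling of the real loop are omitted.

#include <stdio.h>
#include <string.h>

/* "4:cpu,cpuacct:/user.slice"  ->  idx = "4", controllers = "cpu,cpuacct"
 * "0::/"                       ->  idx = "0", controllers = "" ("unified") */
static void split_cgroup_line(char *line, char **idx, char **controllers)
{
	char *p = strchr(line, ':');
	char *p2;

	*idx = line;
	*p++ = '\0';           /* terminate the hierarchy index */

	p2 = strrchr(p, ':');  /* the cgroup path follows the last ':' */
	*p2 = '\0';            /* keep only the controller list */
	*controllers = p;
}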
6298
6299static void __attribute__((destructor)) free_subsystems(void)
6300{
6301 int i;
6302
b8defc3d
CB
6303 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6304
29a73c2f 6305 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
6306 if (hierarchies[i])
6307 free(hierarchies[i]);
480262c9 6308 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
6309 close(fd_hierarchies[i]);
6310 }
237e200e 6311 free(hierarchies);
480262c9 6312 free(fd_hierarchies);
056adcef 6313 free_cpuview();
a257a8ee
CB
6314
6315 if (cgroup_mount_ns_fd >= 0)
6316 close(cgroup_mount_ns_fd);
237e200e 6317}