]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
CPU view: handle CPU hotplug at runtime
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
42/* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
43#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
45/* Define pivot_root() if missing from the C library */
46#ifndef HAVE_PIVOT_ROOT
47static int pivot_root(const char * new_root, const char * put_old)
48{
49#ifdef __NR_pivot_root
50return syscall(__NR_pivot_root, new_root, put_old);
51#else
52errno = ENOSYS;
53return -1;
54#endif
55}
56#else
57extern int pivot_root(const char * new_root, const char * put_old);
58#endif
59
237e200e
SH
60enum {
61 LXC_TYPE_CGDIR,
62 LXC_TYPE_CGFILE,
63 LXC_TYPE_PROC_MEMINFO,
64 LXC_TYPE_PROC_CPUINFO,
65 LXC_TYPE_PROC_UPTIME,
66 LXC_TYPE_PROC_STAT,
67 LXC_TYPE_PROC_DISKSTATS,
70dcc12e 68 LXC_TYPE_PROC_SWAPS,
46be8eed 69 LXC_TYPE_PROC_LOADAVG,
237e200e
SH
70};
71
72struct file_info {
73 char *controller;
74 char *cgroup;
75 char *file;
76 int type;
77 char *buf; // unused as of yet
78 int buflen;
79 int size; //actual data size
80 int cached;
81};
82
8be92dd1
JS
83struct cpuacct_usage {
84 uint64_t user;
85 uint64_t system;
056adcef 86 uint64_t idle;
8be92dd1
JS
87};
88
0e47acaa 89/* The function of hash table.*/
90#define LOAD_SIZE 100 /*the size of hash_table */
6db4f7a3 91#define FLUSH_TIME 5 /*the flush rate */
92#define DEPTH_DIR 3 /*the depth of per cgroup */
93/* The function of calculate loadavg .*/
94#define FSHIFT 11 /* nr of bits of precision */
95#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
96#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
97#define EXP_5 2014 /* 1/exp(5sec/5min) */
98#define EXP_15 2037 /* 1/exp(5sec/15min) */
99#define LOAD_INT(x) ((x) >> FSHIFT)
100#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
beb5024e 101/*
6db4f7a3 102 * This parameter is used for proc_loadavg_read().
103 * 1 means use loadavg, 0 means not use.
104 */
105static int loadavg = 0;
a83618e2 106static volatile sig_atomic_t loadavg_stop = 0;
056adcef 107static int calc_hash(const char *name)
0e47acaa 108{
109 unsigned int hash = 0;
110 unsigned int x = 0;
111 /* ELFHash algorithm. */
112 while (*name) {
113 hash = (hash << 4) + *name++;
114 x = hash & 0xf0000000;
115 if (x != 0)
116 hash ^= (x >> 24);
117 hash &= ~x;
118 }
b077527b 119 return (hash & 0x7fffffff);
0e47acaa 120}
121
122struct load_node {
123 char *cg; /*cg */
124 unsigned long avenrun[3]; /* Load averages */
125 unsigned int run_pid;
126 unsigned int total_pid;
127 unsigned int last_pid;
128 int cfd; /* The file descriptor of the mounted cgroup */
129 struct load_node *next;
130 struct load_node **pre;
131};
132
133struct load_head {
134 /*
135 * The lock is about insert load_node and refresh load_node.To the first
136 * load_node of each hash bucket, insert and refresh in this hash bucket is
137 * mutually exclusive.
138 */
139 pthread_mutex_t lock;
140 /*
141 * The rdlock is about read loadavg and delete load_node.To each hash
142 * bucket, read and delete is mutually exclusive. But at the same time, we
143 * allow paratactic read operation. This rdlock is at list level.
144 */
145 pthread_rwlock_t rdlock;
146 /*
147 * The rilock is about read loadavg and insert load_node.To the first
148 * load_node of each hash bucket, read and insert is mutually exclusive.
149 * But at the same time, we allow paratactic read operation.
150 */
151 pthread_rwlock_t rilock;
152 struct load_node *next;
153};
154
155static struct load_head load_hash[LOAD_SIZE]; /* hash table */
156/*
157 * init_load initialize the hash table.
158 * Return 0 on success, return -1 on failure.
159 */
160static int init_load(void)
161{
162 int i;
163 int ret;
164
165 for (i = 0; i < LOAD_SIZE; i++) {
166 load_hash[i].next = NULL;
167 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
168 if (ret != 0) {
169 lxcfs_error("%s\n", "Failed to initialize lock");
170 goto out3;
171 }
172 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
173 if (ret != 0) {
174 lxcfs_error("%s\n", "Failed to initialize rdlock");
175 goto out2;
176 }
177 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
178 if (ret != 0) {
179 lxcfs_error("%s\n", "Failed to initialize rilock");
180 goto out1;
181 }
182 }
183 return 0;
184out1:
185 pthread_rwlock_destroy(&load_hash[i].rdlock);
186out2:
187 pthread_mutex_destroy(&load_hash[i].lock);
188out3:
189 while (i > 0) {
190 i--;
191 pthread_mutex_destroy(&load_hash[i].lock);
192 pthread_rwlock_destroy(&load_hash[i].rdlock);
193 pthread_rwlock_destroy(&load_hash[i].rilock);
194 }
195 return -1;
196}
197
198static void insert_node(struct load_node **n, int locate)
199{
200 struct load_node *f;
201
202 pthread_mutex_lock(&load_hash[locate].lock);
203 pthread_rwlock_wrlock(&load_hash[locate].rilock);
204 f = load_hash[locate].next;
205 load_hash[locate].next = *n;
206
207 (*n)->pre = &(load_hash[locate].next);
208 if (f)
209 f->pre = &((*n)->next);
210 (*n)->next = f;
211 pthread_mutex_unlock(&load_hash[locate].lock);
212 pthread_rwlock_unlock(&load_hash[locate].rilock);
213}
214/*
215 * locate_node() finds special node. Not return NULL means success.
216 * It should be noted that rdlock isn't unlocked at the end of code
217 * because this function is used to read special node. Delete is not
218 * allowed before read has ended.
219 * unlock rdlock only in proc_loadavg_read().
220 */
221static struct load_node *locate_node(char *cg, int locate)
222{
223 struct load_node *f = NULL;
224 int i = 0;
225
226 pthread_rwlock_rdlock(&load_hash[locate].rilock);
227 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
228 if (load_hash[locate].next == NULL) {
229 pthread_rwlock_unlock(&load_hash[locate].rilock);
230 return f;
231 }
232 f = load_hash[locate].next;
233 pthread_rwlock_unlock(&load_hash[locate].rilock);
234 while (f && ((i = strcmp(f->cg, cg)) != 0))
235 f = f->next;
236 return f;
237}
238/* Delete the load_node n and return the next node of it. */
239static struct load_node *del_node(struct load_node *n, int locate)
240{
241 struct load_node *g;
242
243 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
244 if (n->next == NULL) {
245 *(n->pre) = NULL;
246 } else {
247 *(n->pre) = n->next;
248 n->next->pre = n->pre;
249 }
250 g = n->next;
251 free(n->cg);
252 free(n);
253 pthread_rwlock_unlock(&load_hash[locate].rdlock);
254 return g;
255}
256
a83618e2 257static void load_free(void)
9c480eb7 258{
259 int i;
260 struct load_node *f, *p;
261
262 for (i = 0; i < LOAD_SIZE; i++) {
263 pthread_mutex_lock(&load_hash[i].lock);
264 pthread_rwlock_wrlock(&load_hash[i].rilock);
265 pthread_rwlock_wrlock(&load_hash[i].rdlock);
266 if (load_hash[i].next == NULL) {
267 pthread_mutex_unlock(&load_hash[i].lock);
268 pthread_mutex_destroy(&load_hash[i].lock);
269 pthread_rwlock_unlock(&load_hash[i].rilock);
270 pthread_rwlock_destroy(&load_hash[i].rilock);
271 pthread_rwlock_unlock(&load_hash[i].rdlock);
272 pthread_rwlock_destroy(&load_hash[i].rdlock);
273 continue;
274 }
275 for (f = load_hash[i].next; f; ) {
276 free(f->cg);
277 p = f->next;
278 free(f);
279 f = p;
280 }
281 pthread_mutex_unlock(&load_hash[i].lock);
282 pthread_mutex_destroy(&load_hash[i].lock);
283 pthread_rwlock_unlock(&load_hash[i].rilock);
284 pthread_rwlock_destroy(&load_hash[i].rilock);
285 pthread_rwlock_unlock(&load_hash[i].rdlock);
286 pthread_rwlock_destroy(&load_hash[i].rdlock);
287 }
288}
056adcef
JS
289
290/* Data for CPU view */
291struct cg_proc_stat {
292 char *cg;
293 struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
294 struct cpuacct_usage *view; // Usage stats reported to the container
295 int cpu_count;
2f49b662 296 pthread_mutex_t lock; // For node manipulation
056adcef
JS
297 struct cg_proc_stat *next;
298};
299
300struct cg_proc_stat_head {
301 struct cg_proc_stat *next;
951acc94 302 time_t lastcheck;
2f49b662
JS
303
304 /*
305 * For access to the list. Reading can be parallel, pruning is exclusive.
306 */
307 pthread_rwlock_t lock;
056adcef
JS
308};
309
310#define CPUVIEW_HASH_SIZE 100
311static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
312
313static bool cpuview_init_head(struct cg_proc_stat_head **head)
314{
315 *head = malloc(sizeof(struct cg_proc_stat_head));
316 if (!(*head)) {
317 lxcfs_error("%s\n", strerror(errno));
318 return false;
319 }
320
951acc94 321 (*head)->lastcheck = time(NULL);
056adcef 322 (*head)->next = NULL;
2f49b662
JS
323
324 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
325 lxcfs_error("%s\n", "Failed to initialize list lock");
326 free(*head);
327 return false;
328 }
329
056adcef
JS
330 return true;
331}
332
333static bool init_cpuview()
334{
335 int i;
336
337 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
338 proc_stat_history[i] = NULL;
339
340 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
341 if (!cpuview_init_head(&proc_stat_history[i]))
342 goto err;
343 }
344
345 return true;
346
347err:
348 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
349 if (proc_stat_history[i]) {
350 free(proc_stat_history[i]);
351 proc_stat_history[i] = NULL;
352 }
353 }
354
355 return false;
356}
357
951acc94
JS
358static void free_proc_stat_node(struct cg_proc_stat *node)
359{
2f49b662 360 pthread_mutex_destroy(&node->lock);
951acc94
JS
361 free(node->cg);
362 free(node->usage);
363 free(node->view);
364 free(node);
365}
366
056adcef
JS
367static void cpuview_free_head(struct cg_proc_stat_head *head)
368{
369 struct cg_proc_stat *node, *tmp;
370
371 if (head->next) {
372 node = head->next;
373
374 for (;;) {
375 tmp = node;
376 node = node->next;
951acc94 377 free_proc_stat_node(tmp);
056adcef
JS
378
379 if (!node)
380 break;
381 }
382 }
383
2f49b662 384 pthread_rwlock_destroy(&head->lock);
056adcef
JS
385 free(head);
386}
387
388static void free_cpuview()
389{
390 int i;
391
392 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
393 if (proc_stat_history[i])
394 cpuview_free_head(proc_stat_history[i]);
395 }
396}
397
f34de69a
CB
398/* Reserve buffer size to account for file size changes. */
399#define BUF_RESERVE_SIZE 512
237e200e
SH
400
401/*
402 * A table caching which pid is init for a pid namespace.
403 * When looking up which pid is init for $qpid, we first
404 * 1. Stat /proc/$qpid/ns/pid.
405 * 2. Check whether the ino_t is in our store.
406 * a. if not, fork a child in qpid's ns to send us
407 * ucred.pid = 1, and read the initpid. Cache
408 * initpid and creation time for /proc/initpid
409 * in a new store entry.
410 * b. if so, verify that /proc/initpid still matches
411 * what we have saved. If not, clear the store
412 * entry and go back to a. If so, return the
413 * cached initpid.
414 */
415struct pidns_init_store {
416 ino_t ino; // inode number for /proc/$pid/ns/pid
417 pid_t initpid; // the pid of nit in that ns
418 long int ctime; // the time at which /proc/$initpid was created
419 struct pidns_init_store *next;
420 long int lastcheck;
421};
422
423/* lol - look at how they are allocated in the kernel */
424#define PIDNS_HASH_SIZE 4096
425#define HASH(x) ((x) % PIDNS_HASH_SIZE)
426
427static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
428static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
429static void lock_mutex(pthread_mutex_t *l)
430{
431 int ret;
432
433 if ((ret = pthread_mutex_lock(l)) != 0) {
b8defc3d 434 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
237e200e
SH
435 exit(1);
436 }
437}
438
29a73c2f
CB
439/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
440 * Number of hierarchies mounted. */
441static int num_hierarchies;
442
443/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
444 * Hierachies mounted {cpuset, blkio, ...}:
445 * Initialized via __constructor__ collect_and_mount_subsystems(). */
446static char **hierarchies;
447
448/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
449 * Open file descriptors:
450 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
451 * private mount namespace.
452 * Initialized via __constructor__ collect_and_mount_subsystems().
453 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
454 * mounts and respective files in the private namespace even when located in
455 * another namespace using the *at() family of functions
456 * {openat(), fchownat(), ...}. */
457static int *fd_hierarchies;
a257a8ee 458static int cgroup_mount_ns_fd = -1;
29a73c2f 459
237e200e
SH
460static void unlock_mutex(pthread_mutex_t *l)
461{
462 int ret;
463
464 if ((ret = pthread_mutex_unlock(l)) != 0) {
b8defc3d 465 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
237e200e
SH
466 exit(1);
467 }
468}
469
470static void store_lock(void)
471{
472 lock_mutex(&pidns_store_mutex);
473}
474
475static void store_unlock(void)
476{
477 unlock_mutex(&pidns_store_mutex);
478}
479
480/* Must be called under store_lock */
481static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
482{
483 struct stat initsb;
484 char fnam[100];
485
486 snprintf(fnam, 100, "/proc/%d", e->initpid);
487 if (stat(fnam, &initsb) < 0)
488 return false;
7dd6560a
CB
489
490 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
491 initsb.st_ctime, e->initpid);
492
237e200e
SH
493 if (e->ctime != initsb.st_ctime)
494 return false;
495 return true;
496}
497
498/* Must be called under store_lock */
499static void remove_initpid(struct pidns_init_store *e)
500{
501 struct pidns_init_store *tmp;
502 int h;
503
7dd6560a
CB
504 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
505
237e200e
SH
506 h = HASH(e->ino);
507 if (pidns_hash_table[h] == e) {
508 pidns_hash_table[h] = e->next;
509 free(e);
510 return;
511 }
512
513 tmp = pidns_hash_table[h];
514 while (tmp) {
515 if (tmp->next == e) {
516 tmp->next = e->next;
517 free(e);
518 return;
519 }
520 tmp = tmp->next;
521 }
522}
523
524#define PURGE_SECS 5
525/* Must be called under store_lock */
526static void prune_initpid_store(void)
527{
528 static long int last_prune = 0;
529 struct pidns_init_store *e, *prev, *delme;
530 long int now, threshold;
531 int i;
532
533 if (!last_prune) {
534 last_prune = time(NULL);
535 return;
536 }
537 now = time(NULL);
538 if (now < last_prune + PURGE_SECS)
539 return;
7dd6560a
CB
540
541 lxcfs_debug("%s\n", "Pruning.");
542
237e200e
SH
543 last_prune = now;
544 threshold = now - 2 * PURGE_SECS;
545
546 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
547 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
548 if (e->lastcheck < threshold) {
7dd6560a
CB
549
550 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
551
237e200e
SH
552 delme = e;
553 if (prev)
554 prev->next = e->next;
555 else
556 pidns_hash_table[i] = e->next;
557 e = e->next;
558 free(delme);
559 } else {
560 prev = e;
561 e = e->next;
562 }
563 }
564 }
565}
566
567/* Must be called under store_lock */
568static void save_initpid(struct stat *sb, pid_t pid)
569{
570 struct pidns_init_store *e;
571 char fpath[100];
572 struct stat procsb;
573 int h;
574
7dd6560a
CB
575 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
576
237e200e
SH
577 snprintf(fpath, 100, "/proc/%d", pid);
578 if (stat(fpath, &procsb) < 0)
579 return;
580 do {
581 e = malloc(sizeof(*e));
582 } while (!e);
583 e->ino = sb->st_ino;
584 e->initpid = pid;
585 e->ctime = procsb.st_ctime;
586 h = HASH(e->ino);
587 e->next = pidns_hash_table[h];
588 e->lastcheck = time(NULL);
589 pidns_hash_table[h] = e;
590}
591
592/*
593 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
594 * entry for the inode number and creation time. Verify that the init pid
595 * is still valid. If not, remove it. Return the entry if valid, NULL
596 * otherwise.
597 * Must be called under store_lock
598 */
599static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
600{
601 int h = HASH(sb->st_ino);
602 struct pidns_init_store *e = pidns_hash_table[h];
603
604 while (e) {
605 if (e->ino == sb->st_ino) {
606 if (initpid_still_valid(e, sb)) {
607 e->lastcheck = time(NULL);
608 return e;
609 }
610 remove_initpid(e);
611 return NULL;
612 }
613 e = e->next;
614 }
615
616 return NULL;
617}
618
0f657ce3 619static int is_dir(const char *path, int fd)
237e200e
SH
620{
621 struct stat statbuf;
0f657ce3 622 int ret = fstatat(fd, path, &statbuf, fd);
237e200e
SH
623 if (ret == 0 && S_ISDIR(statbuf.st_mode))
624 return 1;
625 return 0;
626}
627
628static char *must_copy_string(const char *str)
629{
630 char *dup = NULL;
631 if (!str)
632 return NULL;
633 do {
634 dup = strdup(str);
635 } while (!dup);
636
637 return dup;
638}
639
640static inline void drop_trailing_newlines(char *s)
641{
642 int l;
643
644 for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
645 s[l-1] = '\0';
646}
647
648#define BATCH_SIZE 50
649static void dorealloc(char **mem, size_t oldlen, size_t newlen)
650{
651 int newbatches = (newlen / BATCH_SIZE) + 1;
652 int oldbatches = (oldlen / BATCH_SIZE) + 1;
653
654 if (!*mem || newbatches > oldbatches) {
655 char *tmp;
656 do {
657 tmp = realloc(*mem, newbatches * BATCH_SIZE);
658 } while (!tmp);
659 *mem = tmp;
660 }
661}
662static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
663{
664 size_t newlen = *len + linelen;
665 dorealloc(contents, *len, newlen + 1);
666 memcpy(*contents + *len, line, linelen+1);
667 *len = newlen;
668}
669
60f2ae53 670static char *slurp_file(const char *from, int fd)
237e200e
SH
671{
672 char *line = NULL;
673 char *contents = NULL;
60f2ae53 674 FILE *f = fdopen(fd, "r");
237e200e
SH
675 size_t len = 0, fulllen = 0;
676 ssize_t linelen;
677
678 if (!f)
679 return NULL;
680
681 while ((linelen = getline(&line, &len, f)) != -1) {
682 append_line(&contents, &fulllen, line, linelen);
683 }
684 fclose(f);
685
686 if (contents)
687 drop_trailing_newlines(contents);
688 free(line);
689 return contents;
690}
691
ba59ea09 692static bool write_string(const char *fnam, const char *string, int fd)
237e200e
SH
693{
694 FILE *f;
695 size_t len, ret;
696
beb5024e
CB
697 f = fdopen(fd, "w");
698 if (!f)
237e200e 699 return false;
beb5024e 700
237e200e
SH
701 len = strlen(string);
702 ret = fwrite(string, 1, len, f);
703 if (ret != len) {
beb5024e
CB
704 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
705 strerror(errno), string, fnam);
237e200e
SH
706 fclose(f);
707 return false;
708 }
beb5024e 709
237e200e 710 if (fclose(f) < 0) {
beb5024e 711 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
237e200e
SH
712 return false;
713 }
beb5024e 714
237e200e
SH
715 return true;
716}
717
237e200e
SH
718struct cgfs_files {
719 char *name;
720 uint32_t uid, gid;
721 uint32_t mode;
722};
723
0619767c 724#define ALLOC_NUM 20
237e200e
SH
725static bool store_hierarchy(char *stridx, char *h)
726{
0619767c
SH
727 if (num_hierarchies % ALLOC_NUM == 0) {
728 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
729 n *= ALLOC_NUM;
730 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 731 if (!tmp) {
b8defc3d 732 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
733 exit(1);
734 }
237e200e 735 hierarchies = tmp;
237e200e 736 }
f676eb79 737
0619767c 738 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
739 return true;
740}
741
742static void print_subsystems(void)
743{
744 int i;
745
a257a8ee 746 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 747 fprintf(stderr, "hierarchies:\n");
237e200e
SH
748 for (i = 0; i < num_hierarchies; i++) {
749 if (hierarchies[i])
b8defc3d
CB
750 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
751 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
752 }
753}
754
755static bool in_comma_list(const char *needle, const char *haystack)
756{
757 const char *s = haystack, *e;
758 size_t nlen = strlen(needle);
759
06081b29 760 while (*s && (e = strchr(s, ','))) {
237e200e
SH
761 if (nlen != e - s) {
762 s = e + 1;
763 continue;
764 }
765 if (strncmp(needle, s, nlen) == 0)
766 return true;
767 s = e + 1;
768 }
769 if (strcmp(needle, s) == 0)
770 return true;
771 return false;
772}
773
774/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
775/* Return the mounted controller and store the corresponding open file descriptor
776 * referring to the controller mountpoint in the private lxcfs namespace in
777 * @cfd.
778 */
779static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
780{
781 int i;
782
783 for (i = 0; i < num_hierarchies; i++) {
784 if (!hierarchies[i])
785 continue;
5dd3e6fd
CB
786 if (strcmp(hierarchies[i], controller) == 0) {
787 *cfd = fd_hierarchies[i];
237e200e 788 return hierarchies[i];
5dd3e6fd
CB
789 }
790 if (in_comma_list(controller, hierarchies[i])) {
791 *cfd = fd_hierarchies[i];
237e200e 792 return hierarchies[i];
5dd3e6fd 793 }
237e200e
SH
794 }
795
796 return NULL;
797}
798
799bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
800 const char *value)
801{
ba59ea09 802 int ret, fd, cfd;
237e200e 803 size_t len;
f5a6d92e 804 char *fnam, *tmpc;
237e200e 805
f5a6d92e 806 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
807 if (!tmpc)
808 return false;
f5a6d92e
CB
809
810 /* Make sure we pass a relative path to *at() family of functions.
811 * . + /cgroup + / + file + \0
812 */
ba59ea09 813 len = strlen(cgroup) + strlen(file) + 3;
237e200e 814 fnam = alloca(len);
ba59ea09
CB
815 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
816 if (ret < 0 || (size_t)ret >= len)
817 return false;
818
819 fd = openat(cfd, fnam, O_WRONLY);
820 if (fd < 0)
821 return false;
f676eb79 822
ba59ea09 823 return write_string(fnam, value, fd);
237e200e
SH
824}
825
826// Chown all the files in the cgroup directory. We do this when we create
827// a cgroup on behalf of a user.
f23fe717 828static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e 829{
f23fe717 830 struct dirent *direntp;
237e200e
SH
831 char path[MAXPATHLEN];
832 size_t len;
833 DIR *d;
f23fe717 834 int fd1, ret;
237e200e
SH
835
836 len = strlen(dirname);
837 if (len >= MAXPATHLEN) {
b8defc3d 838 lxcfs_error("Pathname too long: %s\n", dirname);
237e200e
SH
839 return;
840 }
841
f23fe717
CB
842 fd1 = openat(fd, dirname, O_DIRECTORY);
843 if (fd1 < 0)
844 return;
845
846 d = fdopendir(fd1);
237e200e 847 if (!d) {
b8defc3d 848 lxcfs_error("Failed to open %s\n", dirname);
237e200e
SH
849 return;
850 }
851
f23fe717 852 while ((direntp = readdir(d))) {
237e200e
SH
853 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
854 continue;
855 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
856 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 857 lxcfs_error("Pathname too long under %s\n", dirname);
237e200e
SH
858 continue;
859 }
f23fe717 860 if (fchownat(fd, path, uid, gid, 0) < 0)
b8defc3d 861 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
237e200e
SH
862 }
863 closedir(d);
864}
865
866int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
867{
5dd3e6fd 868 int cfd;
237e200e 869 size_t len;
f5a6d92e 870 char *dirnam, *tmpc;
237e200e 871
f5a6d92e 872 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
873 if (!tmpc)
874 return -EINVAL;
f5a6d92e
CB
875
876 /* Make sure we pass a relative path to *at() family of functions.
877 * . + /cg + \0
878 */
f23fe717 879 len = strlen(cg) + 2;
237e200e 880 dirnam = alloca(len);
f23fe717 881 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
237e200e 882
f23fe717 883 if (mkdirat(cfd, dirnam, 0755) < 0)
237e200e
SH
884 return -errno;
885
886 if (uid == 0 && gid == 0)
887 return 0;
888
f23fe717 889 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
237e200e
SH
890 return -errno;
891
f23fe717 892 chown_all_cgroup_files(dirnam, uid, gid, cfd);
237e200e
SH
893
894 return 0;
895}
896
7213ec5c 897static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
237e200e 898{
b7672ded 899 struct dirent *direntp;
237e200e
SH
900 DIR *dir;
901 bool ret = false;
902 char pathname[MAXPATHLEN];
b7672ded
CB
903 int dupfd;
904
905 dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
906 if (dupfd < 0)
907 return false;
237e200e 908
b7672ded 909 dir = fdopendir(dupfd);
237e200e 910 if (!dir) {
7dd6560a 911 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
7213ec5c 912 close(dupfd);
237e200e
SH
913 return false;
914 }
915
b7672ded 916 while ((direntp = readdir(dir))) {
237e200e
SH
917 struct stat mystat;
918 int rc;
919
237e200e
SH
920 if (!strcmp(direntp->d_name, ".") ||
921 !strcmp(direntp->d_name, ".."))
922 continue;
923
924 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
925 if (rc < 0 || rc >= MAXPATHLEN) {
b8defc3d 926 lxcfs_error("%s\n", "Pathname too long.");
237e200e
SH
927 continue;
928 }
929
2e81a5e3
CB
930 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
931 if (rc) {
7dd6560a 932 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
237e200e
SH
933 continue;
934 }
7dd6560a 935 if (S_ISDIR(mystat.st_mode))
2e81a5e3 936 if (!recursive_rmdir(pathname, fd, cfd))
7dd6560a 937 lxcfs_debug("Error removing %s.\n", pathname);
237e200e
SH
938 }
939
940 ret = true;
941 if (closedir(dir) < 0) {
b8defc3d 942 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
237e200e
SH
943 ret = false;
944 }
945
2e81a5e3 946 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
7dd6560a 947 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
237e200e
SH
948 ret = false;
949 }
7213ec5c
CB
950
951 close(dupfd);
237e200e
SH
952
953 return ret;
954}
955
956bool cgfs_remove(const char *controller, const char *cg)
957{
b7672ded 958 int fd, cfd;
237e200e 959 size_t len;
f5a6d92e 960 char *dirnam, *tmpc;
7213ec5c 961 bool bret;
237e200e 962
f5a6d92e 963 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
964 if (!tmpc)
965 return false;
f5a6d92e
CB
966
967 /* Make sure we pass a relative path to *at() family of functions.
968 * . + /cg + \0
969 */
b7672ded 970 len = strlen(cg) + 2;
237e200e 971 dirnam = alloca(len);
b7672ded
CB
972 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
973
974 fd = openat(cfd, dirnam, O_DIRECTORY);
975 if (fd < 0)
976 return false;
977
7213ec5c
CB
978 bret = recursive_rmdir(dirnam, fd, cfd);
979 close(fd);
980 return bret;
237e200e
SH
981}
982
983bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
984{
5dd3e6fd 985 int cfd;
237e200e 986 size_t len;
f5a6d92e 987 char *pathname, *tmpc;
237e200e 988
f5a6d92e 989 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
990 if (!tmpc)
991 return false;
f5a6d92e
CB
992
993 /* Make sure we pass a relative path to *at() family of functions.
994 * . + /file + \0
995 */
534690b4 996 len = strlen(file) + 2;
237e200e 997 pathname = alloca(len);
534690b4
CB
998 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
999 if (fchmodat(cfd, pathname, mode, 0) < 0)
237e200e
SH
1000 return false;
1001 return true;
1002}
1003
0f657ce3 1004static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e
SH
1005{
1006 size_t len;
1007 char *fname;
1008
1009 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
1010 fname = alloca(len);
1011 snprintf(fname, len, "%s/tasks", dirname);
0f657ce3 1012 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1013 return -errno;
1014 snprintf(fname, len, "%s/cgroup.procs", dirname);
0f657ce3 1015 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
1016 return -errno;
1017 return 0;
1018}
1019
1020int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
1021{
5dd3e6fd 1022 int cfd;
237e200e 1023 size_t len;
f5a6d92e 1024 char *pathname, *tmpc;
237e200e 1025
f5a6d92e 1026 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1027 if (!tmpc)
1028 return -EINVAL;
f5a6d92e
CB
1029
1030 /* Make sure we pass a relative path to *at() family of functions.
1031 * . + /file + \0
1032 */
0f657ce3 1033 len = strlen(file) + 2;
237e200e 1034 pathname = alloca(len);
0f657ce3
CB
1035 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1036 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
237e200e
SH
1037 return -errno;
1038
0f657ce3 1039 if (is_dir(pathname, cfd))
237e200e 1040 // like cgmanager did, we want to chown the tasks file as well
0f657ce3 1041 return chown_tasks_files(pathname, uid, gid, cfd);
237e200e
SH
1042
1043 return 0;
1044}
1045
1046FILE *open_pids_file(const char *controller, const char *cgroup)
1047{
3ffd08ee 1048 int fd, cfd;
237e200e 1049 size_t len;
f5a6d92e 1050 char *pathname, *tmpc;
237e200e 1051
f5a6d92e 1052 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1053 if (!tmpc)
1054 return NULL;
f5a6d92e
CB
1055
1056 /* Make sure we pass a relative path to *at() family of functions.
1057 * . + /cgroup + / "cgroup.procs" + \0
1058 */
3ffd08ee 1059 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
237e200e 1060 pathname = alloca(len);
3ffd08ee
CB
1061 snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
1062
1063 fd = openat(cfd, pathname, O_WRONLY);
1064 if (fd < 0)
1065 return NULL;
1066
1067 return fdopen(fd, "w");
237e200e
SH
1068}
1069
f366da65
WB
1070static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1071 void ***list, size_t typesize,
1072 void* (*iterator)(const char*, const char*, const char*))
237e200e 1073{
4ea38a4c 1074 int cfd, fd, ret;
237e200e 1075 size_t len;
4ea38a4c 1076 char *cg, *tmpc;
237e200e 1077 char pathname[MAXPATHLEN];
f366da65 1078 size_t sz = 0, asz = 0;
4ea38a4c 1079 struct dirent *dirent;
237e200e 1080 DIR *dir;
237e200e 1081
4ea38a4c 1082 tmpc = find_mounted_controller(controller, &cfd);
f366da65 1083 *list = NULL;
237e200e 1084 if (!tmpc)
e97c834b 1085 return false;
237e200e 1086
f5a6d92e 1087 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
1088 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1089 cg = alloca(len);
1090 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1091 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 1092 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
1093 return false;
1094 }
237e200e 1095
4ea38a4c
CB
1096 fd = openat(cfd, cg, O_DIRECTORY);
1097 if (fd < 0)
1098 return false;
1099
1100 dir = fdopendir(fd);
237e200e
SH
1101 if (!dir)
1102 return false;
1103
4ea38a4c 1104 while ((dirent = readdir(dir))) {
237e200e 1105 struct stat mystat;
237e200e 1106
4ea38a4c
CB
1107 if (!strcmp(dirent->d_name, ".") ||
1108 !strcmp(dirent->d_name, ".."))
237e200e
SH
1109 continue;
1110
4ea38a4c
CB
1111 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1112 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 1113 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
1114 continue;
1115 }
1116
4ea38a4c 1117 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1118 if (ret) {
b8defc3d 1119 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1120 continue;
1121 }
f366da65
WB
1122 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1123 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1124 continue;
1125
1126 if (sz+2 >= asz) {
f366da65 1127 void **tmp;
237e200e
SH
1128 asz += BATCH_SIZE;
1129 do {
f366da65 1130 tmp = realloc(*list, asz * typesize);
237e200e
SH
1131 } while (!tmp);
1132 *list = tmp;
1133 }
4ea38a4c 1134 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1135 (*list)[sz+1] = NULL;
1136 sz++;
1137 }
1138 if (closedir(dir) < 0) {
b8defc3d 1139 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1140 return false;
1141 }
1142 return true;
1143}
1144
f366da65
WB
1145static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1146{
1147 char *dup;
1148 do {
1149 dup = strdup(dir_entry);
1150 } while (!dup);
1151 return dup;
1152}
1153
1154bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1155{
1156 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1157}
1158
237e200e
SH
1159void free_key(struct cgfs_files *k)
1160{
1161 if (!k)
1162 return;
1163 free(k->name);
1164 free(k);
1165}
1166
1167void free_keys(struct cgfs_files **keys)
1168{
1169 int i;
1170
1171 if (!keys)
1172 return;
1173 for (i = 0; keys[i]; i++) {
1174 free_key(keys[i]);
1175 }
1176 free(keys);
1177}
1178
1179bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1180{
60f2ae53 1181 int ret, fd, cfd;
237e200e 1182 size_t len;
f5a6d92e 1183 char *fnam, *tmpc;
237e200e 1184
f5a6d92e 1185 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1186 if (!tmpc)
1187 return false;
f5a6d92e
CB
1188
1189 /* Make sure we pass a relative path to *at() family of functions.
1190 * . + /cgroup + / + file + \0
1191 */
60f2ae53 1192 len = strlen(cgroup) + strlen(file) + 3;
237e200e 1193 fnam = alloca(len);
60f2ae53
CB
1194 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1195 if (ret < 0 || (size_t)ret >= len)
234a820c 1196 return false;
60f2ae53
CB
1197
1198 fd = openat(cfd, fnam, O_RDONLY);
1199 if (fd < 0)
234a820c 1200 return false;
237e200e 1201
60f2ae53 1202 *value = slurp_file(fnam, fd);
237e200e
SH
1203 return *value != NULL;
1204}
1205
951acc94
JS
1206bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1207{
1208 int ret, cfd;
1209 size_t len;
1210 char *fnam, *tmpc;
1211
1212 tmpc = find_mounted_controller(controller, &cfd);
1213 if (!tmpc)
1214 return false;
1215
1216 /* Make sure we pass a relative path to *at() family of functions.
1217 * . + /cgroup + / + file + \0
1218 */
1219 len = strlen(cgroup) + strlen(file) + 3;
1220 fnam = alloca(len);
1221 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1222 if (ret < 0 || (size_t)ret >= len)
1223 return false;
1224
1225 return (faccessat(cfd, fnam, F_OK, 0) == 0);
1226}
1227
237e200e
SH
1228struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1229{
4ea38a4c 1230 int ret, cfd;
237e200e 1231 size_t len;
f5a6d92e 1232 char *fnam, *tmpc;
237e200e
SH
1233 struct stat sb;
1234 struct cgfs_files *newkey;
237e200e 1235
f5a6d92e 1236 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1237 if (!tmpc)
1238 return false;
1239
1240 if (file && *file == '/')
1241 file++;
1242
06081b29 1243 if (file && strchr(file, '/'))
237e200e
SH
1244 return NULL;
1245
f5a6d92e
CB
1246 /* Make sure we pass a relative path to *at() family of functions.
1247 * . + /cgroup + / + file + \0
1248 */
4ea38a4c 1249 len = strlen(cgroup) + 3;
237e200e
SH
1250 if (file)
1251 len += strlen(file) + 1;
1252 fnam = alloca(len);
4ea38a4c
CB
1253 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1254 file ? "/" : "", file ? file : "");
237e200e 1255
4ea38a4c 1256 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1257 if (ret < 0)
1258 return NULL;
1259
1260 do {
1261 newkey = malloc(sizeof(struct cgfs_files));
1262 } while (!newkey);
1263 if (file)
1264 newkey->name = must_copy_string(file);
06081b29
CB
1265 else if (strrchr(cgroup, '/'))
1266 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1267 else
1268 newkey->name = must_copy_string(cgroup);
1269 newkey->uid = sb.st_uid;
1270 newkey->gid = sb.st_gid;
1271 newkey->mode = sb.st_mode;
1272
1273 return newkey;
1274}
1275
f366da65 1276static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
237e200e 1277{
f366da65
WB
1278 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1279 if (!entry) {
b8defc3d
CB
1280 lxcfs_error("Error getting files under %s:%s\n", controller,
1281 cgroup);
237e200e 1282 }
f366da65
WB
1283 return entry;
1284}
1285
1286bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1287{
1288 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1289}
1290
1291bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
5dd3e6fd
CB
1292{
1293 int cfd;
1294 size_t len;
f5a6d92e 1295 char *fnam, *tmpc;
237e200e
SH
1296 int ret;
1297 struct stat sb;
1298
f5a6d92e 1299 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1300 if (!tmpc)
1301 return false;
f5a6d92e
CB
1302
1303 /* Make sure we pass a relative path to *at() family of functions.
1304 * . + /cgroup + / + f + \0
1305 */
d04232f2 1306 len = strlen(cgroup) + strlen(f) + 3;
237e200e 1307 fnam = alloca(len);
d04232f2
CB
1308 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1309 if (ret < 0 || (size_t)ret >= len)
1310 return false;
237e200e 1311
d04232f2 1312 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1313 if (ret < 0 || !S_ISDIR(sb.st_mode))
1314 return false;
f5a6d92e 1315
237e200e
SH
1316 return true;
1317}
1318
1319#define SEND_CREDS_OK 0
1320#define SEND_CREDS_NOTSK 1
1321#define SEND_CREDS_FAIL 2
1322static bool recv_creds(int sock, struct ucred *cred, char *v);
1323static int wait_for_pid(pid_t pid);
1324static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1325static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1326
1327/*
b10bdd6c 1328 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1329 * over a unix sock so we can read the task's reaper's pid in our
1330 * namespace
b10bdd6c
FG
1331 *
1332 * Note: glibc's fork() does not respect pidns, which can lead to failed
1333 * assertions inside glibc (and thus failed forks) if the child's pid in
1334 * the pidns and the parent pid outside are identical. Using clone prevents
1335 * this issue.
237e200e
SH
1336 */
1337static void write_task_init_pid_exit(int sock, pid_t target)
1338{
237e200e
SH
1339 char fnam[100];
1340 pid_t pid;
237e200e 1341 int fd, ret;
b10bdd6c
FG
1342 size_t stack_size = sysconf(_SC_PAGESIZE);
1343 void *stack = alloca(stack_size);
237e200e
SH
1344
1345 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1346 if (ret < 0 || ret >= sizeof(fnam))
1347 _exit(1);
1348
1349 fd = open(fnam, O_RDONLY);
1350 if (fd < 0) {
1351 perror("write_task_init_pid_exit open of ns/pid");
1352 _exit(1);
1353 }
1354 if (setns(fd, 0)) {
1355 perror("write_task_init_pid_exit setns 1");
1356 close(fd);
1357 _exit(1);
1358 }
b10bdd6c 1359 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1360 if (pid < 0)
1361 _exit(1);
1362 if (pid != 0) {
1363 if (!wait_for_pid(pid))
1364 _exit(1);
1365 _exit(0);
1366 }
b10bdd6c
FG
1367}
1368
1369static int send_creds_clone_wrapper(void *arg) {
1370 struct ucred cred;
1371 char v;
1372 int sock = *(int *)arg;
237e200e
SH
1373
1374 /* we are the child */
1375 cred.uid = 0;
1376 cred.gid = 0;
1377 cred.pid = 1;
1378 v = '1';
1379 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1380 return 1;
1381 return 0;
237e200e
SH
1382}
1383
1384static pid_t get_init_pid_for_task(pid_t task)
1385{
1386 int sock[2];
1387 pid_t pid;
1388 pid_t ret = -1;
1389 char v = '0';
1390 struct ucred cred;
1391
1392 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1393 perror("socketpair");
1394 return -1;
1395 }
1396
1397 pid = fork();
1398 if (pid < 0)
1399 goto out;
1400 if (!pid) {
1401 close(sock[1]);
1402 write_task_init_pid_exit(sock[0], task);
1403 _exit(0);
1404 }
1405
1406 if (!recv_creds(sock[1], &cred, &v))
1407 goto out;
1408 ret = cred.pid;
1409
1410out:
1411 close(sock[0]);
1412 close(sock[1]);
1413 if (pid > 0)
1414 wait_for_pid(pid);
1415 return ret;
1416}
1417
1418static pid_t lookup_initpid_in_store(pid_t qpid)
1419{
1420 pid_t answer = 0;
1421 struct stat sb;
1422 struct pidns_init_store *e;
1423 char fnam[100];
1424
1425 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1426 store_lock();
1427 if (stat(fnam, &sb) < 0)
1428 goto out;
1429 e = lookup_verify_initpid(&sb);
1430 if (e) {
1431 answer = e->initpid;
1432 goto out;
1433 }
1434 answer = get_init_pid_for_task(qpid);
1435 if (answer > 0)
1436 save_initpid(&sb, answer);
1437
1438out:
1439 /* we prune at end in case we are returning
1440 * the value we were about to return */
1441 prune_initpid_store();
1442 store_unlock();
1443 return answer;
1444}
1445
1446static int wait_for_pid(pid_t pid)
1447{
1448 int status, ret;
1449
1450 if (pid <= 0)
1451 return -1;
1452
1453again:
1454 ret = waitpid(pid, &status, 0);
1455 if (ret == -1) {
1456 if (errno == EINTR)
1457 goto again;
1458 return -1;
1459 }
1460 if (ret != pid)
1461 goto again;
1462 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1463 return -1;
1464 return 0;
1465}
1466
1467
1468/*
1469 * append pid to *src.
1470 * src: a pointer to a char* in which ot append the pid.
1471 * sz: the number of characters printed so far, minus trailing \0.
1472 * asz: the allocated size so far
1473 * pid: the pid to append
1474 */
1475static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1476{
1477 char tmp[30];
1478
1479 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1480
1481 if (!*src || tmplen + *sz + 1 >= *asz) {
1482 char *tmp;
1483 do {
1484 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1485 } while (!tmp);
1486 *src = tmp;
1487 *asz += BUF_RESERVE_SIZE;
1488 }
bbfd0e33 1489 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1490 *sz += tmplen;
237e200e
SH
1491}
1492
1493/*
1494 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1495 * valid in the caller's namespace, return the id mapped into
1496 * pid's namespace.
1497 * Returns the mapped id, or -1 on error.
1498 */
1499unsigned int
1500convert_id_to_ns(FILE *idfile, unsigned int in_id)
1501{
1502 unsigned int nsuid, // base id for a range in the idfile's namespace
1503 hostuid, // base id for a range in the caller's namespace
1504 count; // number of ids in this range
1505 char line[400];
1506 int ret;
1507
1508 fseek(idfile, 0L, SEEK_SET);
1509 while (fgets(line, 400, idfile)) {
1510 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1511 if (ret != 3)
1512 continue;
1513 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1514 /*
1515 * uids wrapped around - unexpected as this is a procfile,
1516 * so just bail.
1517 */
b8defc3d 1518 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
237e200e
SH
1519 nsuid, hostuid, count, line);
1520 return -1;
1521 }
1522 if (hostuid <= in_id && hostuid+count > in_id) {
1523 /*
1524 * now since hostuid <= in_id < hostuid+count, and
1525 * hostuid+count and nsuid+count do not wrap around,
1526 * we know that nsuid+(in_id-hostuid) which must be
1527 * less that nsuid+(count) must not wrap around
1528 */
1529 return (in_id - hostuid) + nsuid;
1530 }
1531 }
1532
1533 // no answer found
1534 return -1;
1535}
1536
1537/*
1538 * for is_privileged_over,
1539 * specify whether we require the calling uid to be root in his
1540 * namespace
1541 */
1542#define NS_ROOT_REQD true
1543#define NS_ROOT_OPT false
1544
1545#define PROCLEN 100
1546
1547static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1548{
1549 char fpath[PROCLEN];
1550 int ret;
1551 bool answer = false;
1552 uid_t nsuid;
1553
1554 if (victim == -1 || uid == -1)
1555 return false;
1556
1557 /*
1558 * If the request is one not requiring root in the namespace,
1559 * then having the same uid suffices. (i.e. uid 1000 has write
1560 * access to files owned by uid 1000
1561 */
1562 if (!req_ns_root && uid == victim)
1563 return true;
1564
1565 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1566 if (ret < 0 || ret >= PROCLEN)
1567 return false;
1568 FILE *f = fopen(fpath, "r");
1569 if (!f)
1570 return false;
1571
1572 /* if caller's not root in his namespace, reject */
1573 nsuid = convert_id_to_ns(f, uid);
1574 if (nsuid)
1575 goto out;
1576
1577 /*
1578 * If victim is not mapped into caller's ns, reject.
1579 * XXX I'm not sure this check is needed given that fuse
1580 * will be sending requests where the vfs has converted
1581 */
1582 nsuid = convert_id_to_ns(f, victim);
1583 if (nsuid == -1)
1584 goto out;
1585
1586 answer = true;
1587
1588out:
1589 fclose(f);
1590 return answer;
1591}
1592
1593static bool perms_include(int fmode, mode_t req_mode)
1594{
1595 mode_t r;
1596
1597 switch (req_mode & O_ACCMODE) {
1598 case O_RDONLY:
1599 r = S_IROTH;
1600 break;
1601 case O_WRONLY:
1602 r = S_IWOTH;
1603 break;
1604 case O_RDWR:
1605 r = S_IROTH | S_IWOTH;
1606 break;
1607 default:
1608 return false;
1609 }
1610 return ((fmode & r) == r);
1611}
1612
1613
1614/*
1615 * taskcg is a/b/c
1616 * querycg is /a/b/c/d/e
1617 * we return 'd'
1618 */
1619static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1620{
1621 char *start, *end;
1622
1623 if (strlen(taskcg) <= strlen(querycg)) {
b8defc3d 1624 lxcfs_error("%s\n", "I was fed bad input.");
237e200e
SH
1625 return NULL;
1626 }
1627
06081b29 1628 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
237e200e
SH
1629 start = strdup(taskcg + 1);
1630 else
1631 start = strdup(taskcg + strlen(querycg) + 1);
1632 if (!start)
1633 return NULL;
1634 end = strchr(start, '/');
1635 if (end)
1636 *end = '\0';
1637 return start;
1638}
1639
1640static void stripnewline(char *x)
1641{
1642 size_t l = strlen(x);
1643 if (l && x[l-1] == '\n')
1644 x[l-1] = '\0';
1645}
1646
1647static char *get_pid_cgroup(pid_t pid, const char *contrl)
1648{
5dd3e6fd 1649 int cfd;
237e200e
SH
1650 char fnam[PROCLEN];
1651 FILE *f;
1652 char *answer = NULL;
1653 char *line = NULL;
1654 size_t len = 0;
1655 int ret;
5dd3e6fd 1656 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1657 if (!h)
1658 return NULL;
1659
1660 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1661 if (ret < 0 || ret >= PROCLEN)
1662 return NULL;
1663 if (!(f = fopen(fnam, "r")))
1664 return NULL;
1665
1666 while (getline(&line, &len, f) != -1) {
1667 char *c1, *c2;
1668 if (!line[0])
1669 continue;
1670 c1 = strchr(line, ':');
1671 if (!c1)
1672 goto out;
1673 c1++;
1674 c2 = strchr(c1, ':');
1675 if (!c2)
1676 goto out;
1677 *c2 = '\0';
1678 if (strcmp(c1, h) != 0)
1679 continue;
1680 c2++;
1681 stripnewline(c2);
1682 do {
1683 answer = strdup(c2);
1684 } while (!answer);
1685 break;
1686 }
1687
1688out:
1689 fclose(f);
1690 free(line);
1691 return answer;
1692}
1693
1694/*
1695 * check whether a fuse context may access a cgroup dir or file
1696 *
1697 * If file is not null, it is a cgroup file to check under cg.
1698 * If file is null, then we are checking perms on cg itself.
1699 *
1700 * For files we can check the mode of the list_keys result.
1701 * For cgroups, we must make assumptions based on the files under the
1702 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1703 * yet.
1704 */
1705static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1706{
1707 struct cgfs_files *k = NULL;
1708 bool ret = false;
1709
1710 k = cgfs_get_key(contrl, cg, file);
1711 if (!k)
1712 return false;
1713
1714 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1715 if (perms_include(k->mode >> 6, mode)) {
1716 ret = true;
1717 goto out;
1718 }
1719 }
1720 if (fc->gid == k->gid) {
1721 if (perms_include(k->mode >> 3, mode)) {
1722 ret = true;
1723 goto out;
1724 }
1725 }
1726 ret = perms_include(k->mode, mode);
1727
1728out:
1729 free_key(k);
1730 return ret;
1731}
1732
1733#define INITSCOPE "/init.scope"
1734static void prune_init_slice(char *cg)
1735{
1736 char *point;
1737 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1738
1739 if (cg_len < initscope_len)
1740 return;
1741
1742 point = cg + cg_len - initscope_len;
1743 if (strcmp(point, INITSCOPE) == 0) {
1744 if (point == cg)
1745 *(point+1) = '\0';
1746 else
1747 *point = '\0';
1748 }
1749}
1750
1751/*
1752 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1753 * If pid is in /a, he may act on /a/b, but not on /b.
1754 * if the answer is false and nextcg is not NULL, then *nextcg will point
1755 * to a string containing the next cgroup directory under cg, which must be
1756 * freed by the caller.
1757 */
1758static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1759{
1760 bool answer = false;
1761 char *c2 = get_pid_cgroup(pid, contrl);
1762 char *linecmp;
1763
1764 if (!c2)
1765 return false;
1766 prune_init_slice(c2);
1767
1768 /*
12c31268
CB
1769 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1770 * they pass in a cgroup without leading '/'
1771 *
1772 * The original line here was:
1773 * linecmp = *cg == '/' ? c2 : c2+1;
1774 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1775 * Serge, do you know?
237e200e 1776 */
12c31268
CB
1777 if (*cg == '/' || !strncmp(cg, "./", 2))
1778 linecmp = c2;
1779 else
1780 linecmp = c2 + 1;
237e200e
SH
1781 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1782 if (nextcg) {
1783 *nextcg = get_next_cgroup_dir(linecmp, cg);
1784 }
1785 goto out;
1786 }
1787 answer = true;
1788
1789out:
1790 free(c2);
1791 return answer;
1792}
1793
1794/*
1795 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1796 */
1797static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1798{
1799 bool answer = false;
1800 char *c2, *task_cg;
1801 size_t target_len, task_len;
1802
f7bff426 1803 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
237e200e
SH
1804 return true;
1805
1806 c2 = get_pid_cgroup(pid, contrl);
1807 if (!c2)
1808 return false;
1809 prune_init_slice(c2);
1810
1811 task_cg = c2 + 1;
1812 target_len = strlen(cg);
1813 task_len = strlen(task_cg);
1814 if (task_len == 0) {
1815 /* Task is in the root cg, it can see everything. This case is
1816 * not handled by the strmcps below, since they test for the
1817 * last /, but that is the first / that we've chopped off
1818 * above.
1819 */
1820 answer = true;
1821 goto out;
1822 }
1823 if (strcmp(cg, task_cg) == 0) {
1824 answer = true;
1825 goto out;
1826 }
1827 if (target_len < task_len) {
1828 /* looking up a parent dir */
1829 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1830 answer = true;
1831 goto out;
1832 }
1833 if (target_len > task_len) {
1834 /* looking up a child dir */
1835 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1836 answer = true;
1837 goto out;
1838 }
1839
1840out:
1841 free(c2);
1842 return answer;
1843}
1844
1845/*
1846 * given /cgroup/freezer/a/b, return "freezer".
1847 * the returned char* should NOT be freed.
1848 */
1849static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1850{
1851 const char *p1;
1852 char *contr, *slash;
1853
99142521 1854 if (strlen(path) < 9) {
e254948f 1855 errno = EACCES;
237e200e 1856 return NULL;
99142521
CB
1857 }
1858 if (*(path + 7) != '/') {
1859 errno = EINVAL;
237e200e 1860 return NULL;
99142521 1861 }
3adc421c 1862 p1 = path + 8;
237e200e 1863 contr = strdupa(p1);
99142521
CB
1864 if (!contr) {
1865 errno = ENOMEM;
237e200e 1866 return NULL;
99142521 1867 }
237e200e
SH
1868 slash = strstr(contr, "/");
1869 if (slash)
1870 *slash = '\0';
1871
1872 int i;
3adc421c 1873 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1874 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1875 return hierarchies[i];
1876 }
99142521 1877 errno = ENOENT;
237e200e
SH
1878 return NULL;
1879}
1880
1881/*
1882 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1883 * Note that the returned value may include files (keynames) etc
1884 */
1885static const char *find_cgroup_in_path(const char *path)
1886{
1887 const char *p1;
1888
bc70ba9b 1889 if (strlen(path) < 9) {
e254948f 1890 errno = EACCES;
237e200e 1891 return NULL;
bc70ba9b
CB
1892 }
1893 p1 = strstr(path + 8, "/");
1894 if (!p1) {
1895 errno = EINVAL;
237e200e 1896 return NULL;
bc70ba9b
CB
1897 }
1898 errno = 0;
1899 return p1 + 1;
237e200e
SH
1900}
1901
1902/*
1903 * split the last path element from the path in @cg.
1904 * @dir is newly allocated and should be freed, @last not
1905*/
1906static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1907{
1908 char *p;
1909
1910 do {
1911 *dir = strdup(cg);
1912 } while (!*dir);
1913 *last = strrchr(cg, '/');
1914 if (!*last) {
1915 *last = NULL;
1916 return;
1917 }
1918 p = strrchr(*dir, '/');
1919 *p = '\0';
1920}
1921
1922/*
1923 * FUSE ops for /cgroup
1924 */
1925
1926int cg_getattr(const char *path, struct stat *sb)
1927{
1928 struct timespec now;
1929 struct fuse_context *fc = fuse_get_context();
1930 char * cgdir = NULL;
1931 char *last = NULL, *path1, *path2;
1932 struct cgfs_files *k = NULL;
1933 const char *cgroup;
1934 const char *controller = NULL;
1935 int ret = -ENOENT;
1936
1937
1938 if (!fc)
1939 return -EIO;
1940
1941 memset(sb, 0, sizeof(struct stat));
1942
1943 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1944 return -EINVAL;
1945
1946 sb->st_uid = sb->st_gid = 0;
1947 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1948 sb->st_size = 0;
1949
1950 if (strcmp(path, "/cgroup") == 0) {
1951 sb->st_mode = S_IFDIR | 00755;
1952 sb->st_nlink = 2;
1953 return 0;
1954 }
1955
1956 controller = pick_controller_from_path(fc, path);
1957 if (!controller)
2f7036d0 1958 return -errno;
237e200e
SH
1959 cgroup = find_cgroup_in_path(path);
1960 if (!cgroup) {
1961 /* this is just /cgroup/controller, return it as a dir */
1962 sb->st_mode = S_IFDIR | 00755;
1963 sb->st_nlink = 2;
1964 return 0;
1965 }
1966
1967 get_cgdir_and_path(cgroup, &cgdir, &last);
1968
1969 if (!last) {
1970 path1 = "/";
1971 path2 = cgdir;
1972 } else {
1973 path1 = cgdir;
1974 path2 = last;
1975 }
1976
1977 pid_t initpid = lookup_initpid_in_store(fc->pid);
1978 if (initpid <= 0)
1979 initpid = fc->pid;
1980 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1981 * Then check that caller's cgroup is under path if last is a child
1982 * cgroup, or cgdir if last is a file */
1983
1984 if (is_child_cgroup(controller, path1, path2)) {
1985 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1986 ret = -ENOENT;
1987 goto out;
1988 }
1989 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1990 /* this is just /cgroup/controller, return it as a dir */
1991 sb->st_mode = S_IFDIR | 00555;
1992 sb->st_nlink = 2;
1993 ret = 0;
1994 goto out;
1995 }
1996 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1997 ret = -EACCES;
1998 goto out;
1999 }
2000
2001 // get uid, gid, from '/tasks' file and make up a mode
2002 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2003 sb->st_mode = S_IFDIR | 00755;
2004 k = cgfs_get_key(controller, cgroup, NULL);
2005 if (!k) {
2006 sb->st_uid = sb->st_gid = 0;
2007 } else {
2008 sb->st_uid = k->uid;
2009 sb->st_gid = k->gid;
2010 }
2011 free_key(k);
2012 sb->st_nlink = 2;
2013 ret = 0;
2014 goto out;
2015 }
2016
2017 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
2018 sb->st_mode = S_IFREG | k->mode;
2019 sb->st_nlink = 1;
2020 sb->st_uid = k->uid;
2021 sb->st_gid = k->gid;
2022 sb->st_size = 0;
2023 free_key(k);
2024 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2025 ret = -ENOENT;
2026 goto out;
2027 }
237e200e
SH
2028 ret = 0;
2029 }
2030
2031out:
2032 free(cgdir);
2033 return ret;
2034}
2035
2036int cg_opendir(const char *path, struct fuse_file_info *fi)
2037{
2038 struct fuse_context *fc = fuse_get_context();
2039 const char *cgroup;
2040 struct file_info *dir_info;
2041 char *controller = NULL;
2042
2043 if (!fc)
2044 return -EIO;
2045
2046 if (strcmp(path, "/cgroup") == 0) {
2047 cgroup = NULL;
2048 controller = NULL;
2049 } else {
2050 // return list of keys for the controller, and list of child cgroups
2051 controller = pick_controller_from_path(fc, path);
2052 if (!controller)
2f7036d0 2053 return -errno;
237e200e
SH
2054
2055 cgroup = find_cgroup_in_path(path);
2056 if (!cgroup) {
2057 /* this is just /cgroup/controller, return its contents */
2058 cgroup = "/";
2059 }
2060 }
2061
2062 pid_t initpid = lookup_initpid_in_store(fc->pid);
2063 if (initpid <= 0)
2064 initpid = fc->pid;
2065 if (cgroup) {
2066 if (!caller_may_see_dir(initpid, controller, cgroup))
2067 return -ENOENT;
2068 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2069 return -EACCES;
2070 }
2071
2072 /* we'll free this at cg_releasedir */
2073 dir_info = malloc(sizeof(*dir_info));
2074 if (!dir_info)
2075 return -ENOMEM;
2076 dir_info->controller = must_copy_string(controller);
2077 dir_info->cgroup = must_copy_string(cgroup);
2078 dir_info->type = LXC_TYPE_CGDIR;
2079 dir_info->buf = NULL;
2080 dir_info->file = NULL;
2081 dir_info->buflen = 0;
2082
2083 fi->fh = (unsigned long)dir_info;
2084 return 0;
2085}
2086
2087int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2088 struct fuse_file_info *fi)
2089{
2090 struct file_info *d = (struct file_info *)fi->fh;
2091 struct cgfs_files **list = NULL;
2092 int i, ret;
2093 char *nextcg = NULL;
2094 struct fuse_context *fc = fuse_get_context();
2095 char **clist = NULL;
2096
d639f863
CB
2097 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
2098 return -EIO;
2099
237e200e 2100 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 2101 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
2102 return -EIO;
2103 }
2104 if (!d->cgroup && !d->controller) {
2105 // ls /var/lib/lxcfs/cgroup - just show list of controllers
2106 int i;
2107
2108 for (i = 0; i < num_hierarchies; i++) {
2109 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
2110 return -EIO;
2111 }
2112 }
2113 return 0;
2114 }
2115
2116 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
2117 // not a valid cgroup
2118 ret = -EINVAL;
2119 goto out;
2120 }
2121
2122 pid_t initpid = lookup_initpid_in_store(fc->pid);
2123 if (initpid <= 0)
2124 initpid = fc->pid;
2125 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2126 if (nextcg) {
2127 ret = filler(buf, nextcg, NULL, 0);
2128 free(nextcg);
2129 if (ret != 0) {
2130 ret = -EIO;
2131 goto out;
2132 }
2133 }
2134 ret = 0;
2135 goto out;
2136 }
2137
2138 for (i = 0; list[i]; i++) {
2139 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2140 ret = -EIO;
2141 goto out;
2142 }
2143 }
2144
2145 // now get the list of child cgroups
2146
2147 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2148 ret = 0;
2149 goto out;
2150 }
f366da65
WB
2151 if (clist) {
2152 for (i = 0; clist[i]; i++) {
2153 if (filler(buf, clist[i], NULL, 0) != 0) {
2154 ret = -EIO;
2155 goto out;
2156 }
237e200e
SH
2157 }
2158 }
2159 ret = 0;
2160
2161out:
2162 free_keys(list);
2163 if (clist) {
2164 for (i = 0; clist[i]; i++)
2165 free(clist[i]);
2166 free(clist);
2167 }
2168 return ret;
2169}
2170
43215927 2171static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2172{
43215927
SH
2173 struct file_info *f = (struct file_info *)fi->fh;
2174
237e200e
SH
2175 if (!f)
2176 return;
43215927
SH
2177
2178 fi->fh = 0;
2179
237e200e 2180 free(f->controller);
43215927 2181 f->controller = NULL;
237e200e 2182 free(f->cgroup);
43215927 2183 f->cgroup = NULL;
237e200e 2184 free(f->file);
43215927 2185 f->file = NULL;
237e200e 2186 free(f->buf);
43215927 2187 f->buf = NULL;
237e200e 2188 free(f);
bbb508dd 2189 f = NULL;
237e200e
SH
2190}
2191
2192int cg_releasedir(const char *path, struct fuse_file_info *fi)
2193{
43215927 2194 do_release_file_info(fi);
237e200e
SH
2195 return 0;
2196}
2197
2198int cg_open(const char *path, struct fuse_file_info *fi)
2199{
2200 const char *cgroup;
2201 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2202 struct cgfs_files *k = NULL;
2203 struct file_info *file_info;
2204 struct fuse_context *fc = fuse_get_context();
2205 int ret;
2206
2207 if (!fc)
2208 return -EIO;
2209
2210 controller = pick_controller_from_path(fc, path);
2211 if (!controller)
2f7036d0 2212 return -errno;
237e200e
SH
2213 cgroup = find_cgroup_in_path(path);
2214 if (!cgroup)
bc70ba9b 2215 return -errno;
237e200e
SH
2216
2217 get_cgdir_and_path(cgroup, &cgdir, &last);
2218 if (!last) {
2219 path1 = "/";
2220 path2 = cgdir;
2221 } else {
2222 path1 = cgdir;
2223 path2 = last;
2224 }
2225
2226 k = cgfs_get_key(controller, path1, path2);
2227 if (!k) {
2228 ret = -EINVAL;
2229 goto out;
2230 }
2231 free_key(k);
2232
2233 pid_t initpid = lookup_initpid_in_store(fc->pid);
2234 if (initpid <= 0)
2235 initpid = fc->pid;
2236 if (!caller_may_see_dir(initpid, controller, path1)) {
2237 ret = -ENOENT;
2238 goto out;
2239 }
2240 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2241 ret = -EACCES;
2242 goto out;
2243 }
2244
2245 /* we'll free this at cg_release */
2246 file_info = malloc(sizeof(*file_info));
2247 if (!file_info) {
2248 ret = -ENOMEM;
2249 goto out;
2250 }
2251 file_info->controller = must_copy_string(controller);
2252 file_info->cgroup = must_copy_string(path1);
2253 file_info->file = must_copy_string(path2);
2254 file_info->type = LXC_TYPE_CGFILE;
2255 file_info->buf = NULL;
2256 file_info->buflen = 0;
2257
2258 fi->fh = (unsigned long)file_info;
2259 ret = 0;
2260
2261out:
2262 free(cgdir);
2263 return ret;
2264}
2265
bddbb106
SH
2266int cg_access(const char *path, int mode)
2267{
6f0f6b83 2268 int ret;
bddbb106 2269 const char *cgroup;
6f0f6b83
CB
2270 char *path1, *path2, *controller;
2271 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2272 struct cgfs_files *k = NULL;
2273 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2274
9873c5e8 2275 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2276 return 0;
bddbb106
SH
2277
2278 if (!fc)
2279 return -EIO;
2280
2281 controller = pick_controller_from_path(fc, path);
2282 if (!controller)
2f7036d0 2283 return -errno;
bddbb106 2284 cgroup = find_cgroup_in_path(path);
575316c4
SH
2285 if (!cgroup) {
2286 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2287 if ((mode & W_OK) == 0)
2288 return 0;
2289 return -EACCES;
575316c4 2290 }
bddbb106
SH
2291
2292 get_cgdir_and_path(cgroup, &cgdir, &last);
2293 if (!last) {
2294 path1 = "/";
2295 path2 = cgdir;
2296 } else {
2297 path1 = cgdir;
2298 path2 = last;
2299 }
2300
2301 k = cgfs_get_key(controller, path1, path2);
2302 if (!k) {
3f441bc7
SH
2303 if ((mode & W_OK) == 0)
2304 ret = 0;
2305 else
2306 ret = -EACCES;
bddbb106
SH
2307 goto out;
2308 }
2309 free_key(k);
2310
2311 pid_t initpid = lookup_initpid_in_store(fc->pid);
2312 if (initpid <= 0)
2313 initpid = fc->pid;
2314 if (!caller_may_see_dir(initpid, controller, path1)) {
2315 ret = -ENOENT;
2316 goto out;
2317 }
2318 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2319 ret = -EACCES;
2320 goto out;
2321 }
2322
2323 ret = 0;
2324
2325out:
2326 free(cgdir);
2327 return ret;
2328}
2329
237e200e
SH
2330int cg_release(const char *path, struct fuse_file_info *fi)
2331{
43215927 2332 do_release_file_info(fi);
237e200e
SH
2333 return 0;
2334}
2335
2336#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2337
2338static bool wait_for_sock(int sock, int timeout)
2339{
2340 struct epoll_event ev;
2341 int epfd, ret, now, starttime, deltatime, saved_errno;
2342
2343 if ((starttime = time(NULL)) < 0)
2344 return false;
2345
2346 if ((epfd = epoll_create(1)) < 0) {
b8defc3d 2347 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
237e200e
SH
2348 return false;
2349 }
2350
2351 ev.events = POLLIN_SET;
2352 ev.data.fd = sock;
2353 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
b8defc3d 2354 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
237e200e
SH
2355 close(epfd);
2356 return false;
2357 }
2358
2359again:
2360 if ((now = time(NULL)) < 0) {
2361 close(epfd);
2362 return false;
2363 }
2364
2365 deltatime = (starttime + timeout) - now;
2366 if (deltatime < 0) { // timeout
2367 errno = 0;
2368 close(epfd);
2369 return false;
2370 }
2371 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2372 if (ret < 0 && errno == EINTR)
2373 goto again;
2374 saved_errno = errno;
2375 close(epfd);
2376
2377 if (ret <= 0) {
2378 errno = saved_errno;
2379 return false;
2380 }
2381 return true;
2382}
2383
2384static int msgrecv(int sockfd, void *buf, size_t len)
2385{
2386 if (!wait_for_sock(sockfd, 2))
2387 return -1;
2388 return recv(sockfd, buf, len, MSG_DONTWAIT);
2389}
2390
2391static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2392{
2393 struct msghdr msg = { 0 };
2394 struct iovec iov;
2395 struct cmsghdr *cmsg;
2396 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2397 char buf[1];
2398 buf[0] = 'p';
2399
2400 if (pingfirst) {
2401 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2402 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2403 return SEND_CREDS_FAIL;
2404 }
2405 }
2406
2407 msg.msg_control = cmsgbuf;
2408 msg.msg_controllen = sizeof(cmsgbuf);
2409
2410 cmsg = CMSG_FIRSTHDR(&msg);
2411 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2412 cmsg->cmsg_level = SOL_SOCKET;
2413 cmsg->cmsg_type = SCM_CREDENTIALS;
2414 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2415
2416 msg.msg_name = NULL;
2417 msg.msg_namelen = 0;
2418
2419 buf[0] = v;
2420 iov.iov_base = buf;
2421 iov.iov_len = sizeof(buf);
2422 msg.msg_iov = &iov;
2423 msg.msg_iovlen = 1;
2424
2425 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2426 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2427 if (errno == 3)
2428 return SEND_CREDS_NOTSK;
2429 return SEND_CREDS_FAIL;
2430 }
2431
2432 return SEND_CREDS_OK;
2433}
2434
2435static bool recv_creds(int sock, struct ucred *cred, char *v)
2436{
2437 struct msghdr msg = { 0 };
2438 struct iovec iov;
2439 struct cmsghdr *cmsg;
2440 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2441 char buf[1];
2442 int ret;
2443 int optval = 1;
2444
2445 *v = '1';
2446
2447 cred->pid = -1;
2448 cred->uid = -1;
2449 cred->gid = -1;
2450
2451 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
b8defc3d 2452 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
237e200e
SH
2453 return false;
2454 }
2455 buf[0] = '1';
2456 if (write(sock, buf, 1) != 1) {
b8defc3d 2457 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
237e200e
SH
2458 return false;
2459 }
2460
2461 msg.msg_name = NULL;
2462 msg.msg_namelen = 0;
2463 msg.msg_control = cmsgbuf;
2464 msg.msg_controllen = sizeof(cmsgbuf);
2465
2466 iov.iov_base = buf;
2467 iov.iov_len = sizeof(buf);
2468 msg.msg_iov = &iov;
2469 msg.msg_iovlen = 1;
2470
2471 if (!wait_for_sock(sock, 2)) {
b8defc3d 2472 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
237e200e
SH
2473 return false;
2474 }
2475 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2476 if (ret < 0) {
b8defc3d 2477 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
237e200e
SH
2478 return false;
2479 }
2480
2481 cmsg = CMSG_FIRSTHDR(&msg);
2482
2483 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2484 cmsg->cmsg_level == SOL_SOCKET &&
2485 cmsg->cmsg_type == SCM_CREDENTIALS) {
2486 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2487 }
2488 *v = buf[0];
2489
2490 return true;
2491}
2492
35174b0f
FG
2493struct pid_ns_clone_args {
2494 int *cpipe;
2495 int sock;
2496 pid_t tpid;
2497 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2498};
2499
2500/*
2501 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2502 * with clone(). This simply writes '1' as ACK back to the parent
2503 * before calling the actual wrapped function.
2504 */
2505static int pid_ns_clone_wrapper(void *arg) {
2506 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2507 char b = '1';
2508
2509 close(args->cpipe[0]);
b8defc3d
CB
2510 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2511 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2512 close(args->cpipe[1]);
2513 return args->wrapped(args->sock, args->tpid);
2514}
237e200e
SH
2515
2516/*
2517 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2518 * int value back over the socket. This shifts the pid from the
2519 * sender's pidns into tpid's pidns.
2520 */
35174b0f 2521static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2522{
2523 char v = '0';
2524 struct ucred cred;
2525
2526 while (recv_creds(sock, &cred, &v)) {
2527 if (v == '1')
35174b0f 2528 return 0;
237e200e 2529 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2530 return 1;
237e200e 2531 }
35174b0f 2532 return 0;
237e200e
SH
2533}
2534
35174b0f 2535
237e200e
SH
2536/*
2537 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2538 * in your old pidns. Only children which you clone will be in the target
2539 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2540 * actually convert pids.
2541 *
2542 * Note: glibc's fork() does not respect pidns, which can lead to failed
2543 * assertions inside glibc (and thus failed forks) if the child's pid in
2544 * the pidns and the parent pid outside are identical. Using clone prevents
2545 * this issue.
237e200e
SH
2546 */
2547static void pid_to_ns_wrapper(int sock, pid_t tpid)
2548{
2549 int newnsfd = -1, ret, cpipe[2];
2550 char fnam[100];
2551 pid_t cpid;
2552 char v;
2553
2554 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2555 if (ret < 0 || ret >= sizeof(fnam))
2556 _exit(1);
2557 newnsfd = open(fnam, O_RDONLY);
2558 if (newnsfd < 0)
2559 _exit(1);
2560 if (setns(newnsfd, 0) < 0)
2561 _exit(1);
2562 close(newnsfd);
2563
2564 if (pipe(cpipe) < 0)
2565 _exit(1);
2566
35174b0f
FG
2567 struct pid_ns_clone_args args = {
2568 .cpipe = cpipe,
2569 .sock = sock,
2570 .tpid = tpid,
2571 .wrapped = &pid_to_ns
2572 };
2573 size_t stack_size = sysconf(_SC_PAGESIZE);
2574 void *stack = alloca(stack_size);
2575
2576 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2577 if (cpid < 0)
2578 _exit(1);
2579
237e200e
SH
2580 // give the child 1 second to be done forking and
2581 // write its ack
2582 if (!wait_for_sock(cpipe[0], 1))
2583 _exit(1);
2584 ret = read(cpipe[0], &v, 1);
2585 if (ret != sizeof(char) || v != '1')
2586 _exit(1);
2587
2588 if (!wait_for_pid(cpid))
2589 _exit(1);
2590 _exit(0);
2591}
2592
2593/*
2594 * To read cgroup files with a particular pid, we will setns into the child
2595 * pidns, open a pipe, fork a child - which will be the first to really be in
2596 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2597 */
2598bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2599{
2600 int sock[2] = {-1, -1};
2601 char *tmpdata = NULL;
2602 int ret;
2603 pid_t qpid, cpid = -1;
2604 bool answer = false;
2605 char v = '0';
2606 struct ucred cred;
2607 size_t sz = 0, asz = 0;
2608
2609 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2610 return false;
2611
2612 /*
2613 * Now we read the pids from returned data one by one, pass
2614 * them into a child in the target namespace, read back the
2615 * translated pids, and put them into our to-return data
2616 */
2617
2618 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2619 perror("socketpair");
2620 free(tmpdata);
2621 return false;
2622 }
2623
2624 cpid = fork();
2625 if (cpid == -1)
2626 goto out;
2627
2628 if (!cpid) // child - exits when done
2629 pid_to_ns_wrapper(sock[1], tpid);
2630
2631 char *ptr = tmpdata;
2632 cred.uid = 0;
2633 cred.gid = 0;
2634 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2635 cred.pid = qpid;
2636 ret = send_creds(sock[0], &cred, v, true);
2637
2638 if (ret == SEND_CREDS_NOTSK)
2639 goto next;
2640 if (ret == SEND_CREDS_FAIL)
2641 goto out;
2642
2643 // read converted results
2644 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2645 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2646 goto out;
2647 }
2648 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2649 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2650 goto out;
2651 }
2652 must_strcat_pid(d, &sz, &asz, qpid);
2653next:
2654 ptr = strchr(ptr, '\n');
2655 if (!ptr)
2656 break;
2657 ptr++;
2658 }
2659
2660 cred.pid = getpid();
2661 v = '1';
2662 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2663 // failed to ask child to exit
b8defc3d 2664 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2665 goto out;
2666 }
2667
2668 answer = true;
2669
2670out:
2671 free(tmpdata);
2672 if (cpid != -1)
2673 wait_for_pid(cpid);
2674 if (sock[0] != -1) {
2675 close(sock[0]);
2676 close(sock[1]);
2677 }
2678 return answer;
2679}
2680
2681int cg_read(const char *path, char *buf, size_t size, off_t offset,
2682 struct fuse_file_info *fi)
2683{
2684 struct fuse_context *fc = fuse_get_context();
2685 struct file_info *f = (struct file_info *)fi->fh;
2686 struct cgfs_files *k = NULL;
2687 char *data = NULL;
2688 int ret, s;
2689 bool r;
2690
2691 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2692 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2693 return -EIO;
2694 }
2695
2696 if (offset)
2697 return 0;
2698
2699 if (!fc)
2700 return -EIO;
2701
2702 if (!f->controller)
2703 return -EINVAL;
2704
2705 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2706 return -EINVAL;
2707 }
2708 free_key(k);
2709
2710
888f8f3c 2711 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2712 ret = -EACCES;
2713 goto out;
2714 }
2715
2716 if (strcmp(f->file, "tasks") == 0 ||
2717 strcmp(f->file, "/tasks") == 0 ||
2718 strcmp(f->file, "/cgroup.procs") == 0 ||
2719 strcmp(f->file, "cgroup.procs") == 0)
2720 // special case - we have to translate the pids
2721 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2722 else
2723 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2724
2725 if (!r) {
2726 ret = -EINVAL;
2727 goto out;
2728 }
2729
2730 if (!data) {
2731 ret = 0;
2732 goto out;
2733 }
2734 s = strlen(data);
2735 if (s > size)
2736 s = size;
2737 memcpy(buf, data, s);
2738 if (s > 0 && s < size && data[s-1] != '\n')
2739 buf[s++] = '\n';
2740
2741 ret = s;
2742
2743out:
2744 free(data);
2745 return ret;
2746}
2747
35174b0f 2748static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2749{
2750 pid_t vpid;
2751 struct ucred cred;
2752 char v;
2753 int ret;
2754
2755 cred.uid = 0;
2756 cred.gid = 0;
2757 while (1) {
2758 if (!wait_for_sock(sock, 2)) {
b8defc3d 2759 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2760 return 1;
237e200e
SH
2761 }
2762 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2763 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2764 return 1;
237e200e
SH
2765 }
2766 if (vpid == -1) // done
2767 break;
2768 v = '0';
2769 cred.pid = vpid;
2770 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2771 v = '1';
2772 cred.pid = getpid();
2773 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2774 return 1;
237e200e
SH
2775 }
2776 }
35174b0f 2777 return 0;
237e200e
SH
2778}
2779
2780static void pid_from_ns_wrapper(int sock, pid_t tpid)
2781{
2782 int newnsfd = -1, ret, cpipe[2];
2783 char fnam[100];
2784 pid_t cpid;
2785 char v;
2786
2787 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2788 if (ret < 0 || ret >= sizeof(fnam))
2789 _exit(1);
2790 newnsfd = open(fnam, O_RDONLY);
2791 if (newnsfd < 0)
2792 _exit(1);
2793 if (setns(newnsfd, 0) < 0)
2794 _exit(1);
2795 close(newnsfd);
2796
2797 if (pipe(cpipe) < 0)
2798 _exit(1);
2799
35174b0f
FG
2800 struct pid_ns_clone_args args = {
2801 .cpipe = cpipe,
2802 .sock = sock,
2803 .tpid = tpid,
2804 .wrapped = &pid_from_ns
2805 };
f0f8b851
SH
2806 size_t stack_size = sysconf(_SC_PAGESIZE);
2807 void *stack = alloca(stack_size);
35174b0f
FG
2808
2809 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2810 if (cpid < 0)
2811 _exit(1);
2812
237e200e
SH
2813 // give the child 1 second to be done forking and
2814 // write its ack
2815 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2816 _exit(1);
237e200e 2817 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2818 if (ret != sizeof(char) || v != '1')
2819 _exit(1);
237e200e
SH
2820
2821 if (!wait_for_pid(cpid))
2822 _exit(1);
2823 _exit(0);
237e200e
SH
2824}
2825
2826/*
2827 * Given host @uid, return the uid to which it maps in
2828 * @pid's user namespace, or -1 if none.
2829 */
2830bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2831{
2832 FILE *f;
2833 char line[400];
2834
2835 sprintf(line, "/proc/%d/uid_map", pid);
2836 if ((f = fopen(line, "r")) == NULL) {
2837 return false;
2838 }
2839
2840 *answer = convert_id_to_ns(f, uid);
2841 fclose(f);
2842
2843 if (*answer == -1)
2844 return false;
2845 return true;
2846}
2847
2848/*
2849 * get_pid_creds: get the real uid and gid of @pid from
2850 * /proc/$$/status
2851 * (XXX should we use euid here?)
2852 */
2853void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2854{
2855 char line[400];
2856 uid_t u;
2857 gid_t g;
2858 FILE *f;
2859
2860 *uid = -1;
2861 *gid = -1;
2862 sprintf(line, "/proc/%d/status", pid);
2863 if ((f = fopen(line, "r")) == NULL) {
b8defc3d 2864 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
237e200e
SH
2865 return;
2866 }
2867 while (fgets(line, 400, f)) {
2868 if (strncmp(line, "Uid:", 4) == 0) {
2869 if (sscanf(line+4, "%u", &u) != 1) {
b8defc3d 2870 lxcfs_error("bad uid line for pid %u\n", pid);
237e200e
SH
2871 fclose(f);
2872 return;
2873 }
2874 *uid = u;
2875 } else if (strncmp(line, "Gid:", 4) == 0) {
2876 if (sscanf(line+4, "%u", &g) != 1) {
b8defc3d 2877 lxcfs_error("bad gid line for pid %u\n", pid);
237e200e
SH
2878 fclose(f);
2879 return;
2880 }
2881 *gid = g;
2882 }
2883 }
2884 fclose(f);
2885}
2886
2887/*
2888 * May the requestor @r move victim @v to a new cgroup?
2889 * This is allowed if
2890 * . they are the same task
2891 * . they are ownedy by the same uid
2892 * . @r is root on the host, or
2893 * . @v's uid is mapped into @r's where @r is root.
2894 */
2895bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2896{
2897 uid_t v_uid, tmpuid;
2898 gid_t v_gid;
2899
2900 if (r == v)
2901 return true;
2902 if (r_uid == 0)
2903 return true;
2904 get_pid_creds(v, &v_uid, &v_gid);
2905 if (r_uid == v_uid)
2906 return true;
2907 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2908 && hostuid_to_ns(v_uid, r, &tmpuid))
2909 return true;
2910 return false;
2911}
2912
2913static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2914 const char *file, const char *buf)
2915{
2916 int sock[2] = {-1, -1};
2917 pid_t qpid, cpid = -1;
2918 FILE *pids_file = NULL;
2919 bool answer = false, fail = false;
2920
2921 pids_file = open_pids_file(contrl, cg);
2922 if (!pids_file)
2923 return false;
2924
2925 /*
2926 * write the pids to a socket, have helper in writer's pidns
2927 * call movepid for us
2928 */
2929 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2930 perror("socketpair");
2931 goto out;
2932 }
2933
2934 cpid = fork();
2935 if (cpid == -1)
2936 goto out;
2937
2938 if (!cpid) { // child
2939 fclose(pids_file);
2940 pid_from_ns_wrapper(sock[1], tpid);
2941 }
2942
2943 const char *ptr = buf;
2944 while (sscanf(ptr, "%d", &qpid) == 1) {
2945 struct ucred cred;
2946 char v;
2947
2948 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2949 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
237e200e
SH
2950 goto out;
2951 }
2952
2953 if (recv_creds(sock[0], &cred, &v)) {
2954 if (v == '0') {
2955 if (!may_move_pid(tpid, tuid, cred.pid)) {
2956 fail = true;
2957 break;
2958 }
2959 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2960 fail = true;
2961 }
2962 }
2963
2964 ptr = strchr(ptr, '\n');
2965 if (!ptr)
2966 break;
2967 ptr++;
2968 }
2969
2970 /* All good, write the value */
2971 qpid = -1;
2972 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
b8defc3d 2973 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
237e200e
SH
2974
2975 if (!fail)
2976 answer = true;
2977
2978out:
2979 if (cpid != -1)
2980 wait_for_pid(cpid);
2981 if (sock[0] != -1) {
2982 close(sock[0]);
2983 close(sock[1]);
2984 }
2985 if (pids_file) {
2986 if (fclose(pids_file) != 0)
2987 answer = false;
2988 }
2989 return answer;
2990}
2991
2992int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2993 struct fuse_file_info *fi)
2994{
2995 struct fuse_context *fc = fuse_get_context();
2996 char *localbuf = NULL;
2997 struct cgfs_files *k = NULL;
2998 struct file_info *f = (struct file_info *)fi->fh;
2999 bool r;
3000
3001 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 3002 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
3003 return -EIO;
3004 }
3005
3006 if (offset)
3007 return 0;
3008
3009 if (!fc)
3010 return -EIO;
3011
3012 localbuf = alloca(size+1);
3013 localbuf[size] = '\0';
3014 memcpy(localbuf, buf, size);
3015
3016 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3017 size = -EINVAL;
3018 goto out;
3019 }
3020
3021 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3022 size = -EACCES;
3023 goto out;
3024 }
3025
3026 if (strcmp(f->file, "tasks") == 0 ||
3027 strcmp(f->file, "/tasks") == 0 ||
3028 strcmp(f->file, "/cgroup.procs") == 0 ||
3029 strcmp(f->file, "cgroup.procs") == 0)
3030 // special case - we have to translate the pids
3031 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3032 else
3033 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3034
3035 if (!r)
3036 size = -EINVAL;
3037
3038out:
3039 free_key(k);
3040 return size;
3041}
3042
3043int cg_chown(const char *path, uid_t uid, gid_t gid)
3044{
3045 struct fuse_context *fc = fuse_get_context();
3046 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3047 struct cgfs_files *k = NULL;
3048 const char *cgroup;
3049 int ret;
3050
3051 if (!fc)
3052 return -EIO;
3053
3054 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 3055 return -EPERM;
237e200e
SH
3056
3057 controller = pick_controller_from_path(fc, path);
3058 if (!controller)
bc70ba9b
CB
3059 return errno == ENOENT ? -EPERM : -errno;
3060
237e200e
SH
3061 cgroup = find_cgroup_in_path(path);
3062 if (!cgroup)
3063 /* this is just /cgroup/controller */
bc70ba9b 3064 return -EPERM;
237e200e
SH
3065
3066 get_cgdir_and_path(cgroup, &cgdir, &last);
3067
3068 if (!last) {
3069 path1 = "/";
3070 path2 = cgdir;
3071 } else {
3072 path1 = cgdir;
3073 path2 = last;
3074 }
3075
3076 if (is_child_cgroup(controller, path1, path2)) {
3077 // get uid, gid, from '/tasks' file and make up a mode
3078 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3079 k = cgfs_get_key(controller, cgroup, "tasks");
3080
3081 } else
3082 k = cgfs_get_key(controller, path1, path2);
3083
3084 if (!k) {
3085 ret = -EINVAL;
3086 goto out;
3087 }
3088
3089 /*
3090 * This being a fuse request, the uid and gid must be valid
3091 * in the caller's namespace. So we can just check to make
3092 * sure that the caller is root in his uid, and privileged
3093 * over the file's current owner.
3094 */
3095 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3096 ret = -EACCES;
3097 goto out;
3098 }
3099
3100 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3101
3102out:
3103 free_key(k);
3104 free(cgdir);
3105
3106 return ret;
3107}
3108
3109int cg_chmod(const char *path, mode_t mode)
3110{
3111 struct fuse_context *fc = fuse_get_context();
3112 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3113 struct cgfs_files *k = NULL;
3114 const char *cgroup;
3115 int ret;
3116
3117 if (!fc)
3118 return -EIO;
3119
3120 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 3121 return -EPERM;
237e200e
SH
3122
3123 controller = pick_controller_from_path(fc, path);
3124 if (!controller)
bc70ba9b
CB
3125 return errno == ENOENT ? -EPERM : -errno;
3126
237e200e
SH
3127 cgroup = find_cgroup_in_path(path);
3128 if (!cgroup)
3129 /* this is just /cgroup/controller */
bc70ba9b 3130 return -EPERM;
237e200e
SH
3131
3132 get_cgdir_and_path(cgroup, &cgdir, &last);
3133
3134 if (!last) {
3135 path1 = "/";
3136 path2 = cgdir;
3137 } else {
3138 path1 = cgdir;
3139 path2 = last;
3140 }
3141
3142 if (is_child_cgroup(controller, path1, path2)) {
3143 // get uid, gid, from '/tasks' file and make up a mode
3144 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3145 k = cgfs_get_key(controller, cgroup, "tasks");
3146
3147 } else
3148 k = cgfs_get_key(controller, path1, path2);
3149
3150 if (!k) {
3151 ret = -EINVAL;
3152 goto out;
3153 }
3154
3155 /*
3156 * This being a fuse request, the uid and gid must be valid
3157 * in the caller's namespace. So we can just check to make
3158 * sure that the caller is root in his uid, and privileged
3159 * over the file's current owner.
3160 */
3161 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3162 ret = -EPERM;
3163 goto out;
3164 }
3165
3166 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3167 ret = -EINVAL;
3168 goto out;
3169 }
3170
3171 ret = 0;
3172out:
3173 free_key(k);
3174 free(cgdir);
3175 return ret;
3176}
3177
3178int cg_mkdir(const char *path, mode_t mode)
3179{
3180 struct fuse_context *fc = fuse_get_context();
3181 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3182 const char *cgroup;
3183 int ret;
3184
3185 if (!fc)
3186 return -EIO;
3187
237e200e
SH
3188 controller = pick_controller_from_path(fc, path);
3189 if (!controller)
2f7036d0 3190 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3191
3192 cgroup = find_cgroup_in_path(path);
3193 if (!cgroup)
bc70ba9b 3194 return -errno;
237e200e
SH
3195
3196 get_cgdir_and_path(cgroup, &cgdir, &last);
3197 if (!last)
3198 path1 = "/";
3199 else
3200 path1 = cgdir;
3201
3202 pid_t initpid = lookup_initpid_in_store(fc->pid);
3203 if (initpid <= 0)
3204 initpid = fc->pid;
3205 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3206 if (!next)
3207 ret = -EINVAL;
3208 else if (last && strcmp(next, last) == 0)
3209 ret = -EEXIST;
3210 else
2f7036d0 3211 ret = -EPERM;
237e200e
SH
3212 goto out;
3213 }
3214
3215 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3216 ret = -EACCES;
3217 goto out;
3218 }
3219 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3220 ret = -EACCES;
3221 goto out;
3222 }
3223
3224 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3225
3226out:
3227 free(cgdir);
3228 free(next);
3229 return ret;
3230}
3231
3232int cg_rmdir(const char *path)
3233{
3234 struct fuse_context *fc = fuse_get_context();
3235 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3236 const char *cgroup;
3237 int ret;
3238
3239 if (!fc)
3240 return -EIO;
3241
3242 controller = pick_controller_from_path(fc, path);
e254948f
CB
3243 if (!controller) /* Someone's trying to delete "/cgroup". */
3244 return -EPERM;
237e200e
SH
3245
3246 cgroup = find_cgroup_in_path(path);
e254948f
CB
3247 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3248 return -EPERM;
237e200e
SH
3249
3250 get_cgdir_and_path(cgroup, &cgdir, &last);
3251 if (!last) {
e254948f
CB
3252 /* Someone's trying to delete a cgroup on the same level as the
3253 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3254 * rmdir "/cgroup/blkio/init.slice".
3255 */
3256 ret = -EPERM;
237e200e
SH
3257 goto out;
3258 }
3259
3260 pid_t initpid = lookup_initpid_in_store(fc->pid);
3261 if (initpid <= 0)
3262 initpid = fc->pid;
3263 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3264 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3265 ret = -EBUSY;
3266 else
3267 ret = -ENOENT;
3268 goto out;
3269 }
3270
3271 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3272 ret = -EACCES;
3273 goto out;
3274 }
3275 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3276 ret = -EACCES;
3277 goto out;
3278 }
3279
3280 if (!cgfs_remove(controller, cgroup)) {
3281 ret = -EINVAL;
3282 goto out;
3283 }
3284
3285 ret = 0;
3286
3287out:
3288 free(cgdir);
3289 free(next);
3290 return ret;
3291}
3292
3293static bool startswith(const char *line, const char *pref)
3294{
3295 if (strncmp(line, pref, strlen(pref)) == 0)
3296 return true;
3297 return false;
3298}
3299
c6095b08
SH
3300static void parse_memstat(char *memstat, unsigned long *cached,
3301 unsigned long *active_anon, unsigned long *inactive_anon,
3302 unsigned long *active_file, unsigned long *inactive_file,
559eaa8f 3303 unsigned long *unevictable, unsigned long *shmem)
237e200e
SH
3304{
3305 char *eol;
3306
237e200e 3307 while (*memstat) {
4accebfb
AS
3308 if (startswith(memstat, "total_cache")) {
3309 sscanf(memstat + 11, "%lu", cached);
c6095b08 3310 *cached /= 1024;
4accebfb
AS
3311 } else if (startswith(memstat, "total_active_anon")) {
3312 sscanf(memstat + 17, "%lu", active_anon);
c6095b08 3313 *active_anon /= 1024;
4accebfb
AS
3314 } else if (startswith(memstat, "total_inactive_anon")) {
3315 sscanf(memstat + 19, "%lu", inactive_anon);
c6095b08 3316 *inactive_anon /= 1024;
4accebfb
AS
3317 } else if (startswith(memstat, "total_active_file")) {
3318 sscanf(memstat + 17, "%lu", active_file);
c6095b08 3319 *active_file /= 1024;
4accebfb
AS
3320 } else if (startswith(memstat, "total_inactive_file")) {
3321 sscanf(memstat + 19, "%lu", inactive_file);
c6095b08 3322 *inactive_file /= 1024;
4accebfb
AS
3323 } else if (startswith(memstat, "total_unevictable")) {
3324 sscanf(memstat + 17, "%lu", unevictable);
c6095b08 3325 *unevictable /= 1024;
559eaa8f
JS
3326 } else if (startswith(memstat, "total_shmem")) {
3327 sscanf(memstat + 11, "%lu", shmem);
3328 *shmem /= 1024;
237e200e
SH
3329 }
3330 eol = strchr(memstat, '\n');
3331 if (!eol)
3332 return;
3333 memstat = eol+1;
3334 }
3335}
3336
3337static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3338{
3339 char *eol;
3340 char key[32];
3341
3342 memset(key, 0, 32);
3343 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3344
3345 size_t len = strlen(key);
3346 *v = 0;
3347
3348 while (*str) {
3349 if (startswith(str, key)) {
3350 sscanf(str + len, "%lu", v);
3351 return;
3352 }
3353 eol = strchr(str, '\n');
3354 if (!eol)
3355 return;
3356 str = eol+1;
3357 }
3358}
3359
3360static int read_file(const char *path, char *buf, size_t size,
3361 struct file_info *d)
3362{
3363 size_t linelen = 0, total_len = 0, rv = 0;
3364 char *line = NULL;
3365 char *cache = d->buf;
3366 size_t cache_size = d->buflen;
3367 FILE *f = fopen(path, "r");
3368 if (!f)
3369 return 0;
3370
3371 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3372 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3373 if (l < 0) {
3374 perror("Error writing to cache");
3375 rv = 0;
3376 goto err;
3377 }
3378 if (l >= cache_size) {
b8defc3d 3379 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3380 rv = 0;
3381 goto err;
3382 }
3383 cache += l;
3384 cache_size -= l;
3385 total_len += l;
3386 }
3387
3388 d->size = total_len;
a262ddb7
CB
3389 if (total_len > size)
3390 total_len = size;
237e200e
SH
3391
3392 /* read from off 0 */
3393 memcpy(buf, d->buf, total_len);
3394 rv = total_len;
3395 err:
3396 fclose(f);
3397 free(line);
3398 return rv;
3399}
3400
3401/*
3402 * FUSE ops for /proc
3403 */
3404
018246ff 3405static unsigned long get_memlimit(const char *cgroup, const char *file)
237e200e
SH
3406{
3407 char *memlimit_str = NULL;
3408 unsigned long memlimit = -1;
3409
018246ff 3410 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
237e200e
SH
3411 memlimit = strtoul(memlimit_str, NULL, 10);
3412
3413 free(memlimit_str);
3414
3415 return memlimit;
3416}
3417
018246ff 3418static unsigned long get_min_memlimit(const char *cgroup, const char *file)
237e200e
SH
3419{
3420 char *copy = strdupa(cgroup);
3421 unsigned long memlimit = 0, retlimit;
3422
018246ff 3423 retlimit = get_memlimit(copy, file);
237e200e
SH
3424
3425 while (strcmp(copy, "/") != 0) {
3426 copy = dirname(copy);
018246ff 3427 memlimit = get_memlimit(copy, file);
237e200e
SH
3428 if (memlimit != -1 && memlimit < retlimit)
3429 retlimit = memlimit;
3430 };
3431
3432 return retlimit;
3433}
3434
3435static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3436 struct fuse_file_info *fi)
3437{
3438 struct fuse_context *fc = fuse_get_context();
3439 struct file_info *d = (struct file_info *)fi->fh;
3440 char *cg;
3441 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3442 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3443 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3444 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
559eaa8f 3445 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
594a10e6 3446 hostswtotal = 0;
237e200e
SH
3447 char *line = NULL;
3448 size_t linelen = 0, total_len = 0, rv = 0;
3449 char *cache = d->buf;
3450 size_t cache_size = d->buflen;
3451 FILE *f = NULL;
3452
3453 if (offset){
3454 if (offset > d->size)
3455 return -EINVAL;
3456 if (!d->cached)
3457 return 0;
3458 int left = d->size - offset;
3459 total_len = left > size ? size: left;
3460 memcpy(buf, cache + offset, total_len);
3461 return total_len;
3462 }
3463
3464 pid_t initpid = lookup_initpid_in_store(fc->pid);
3465 if (initpid <= 0)
3466 initpid = fc->pid;
3467 cg = get_pid_cgroup(initpid, "memory");
3468 if (!cg)
3469 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3470 prune_init_slice(cg);
237e200e 3471
018246ff 3472 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3473 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3474 goto err;
3475 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3476 goto err;
3477
3478 // Following values are allowed to fail, because swapaccount might be turned
3479 // off for current kernel
3480 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3481 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3482 {
018246ff 3483 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3484 memswusage = strtoul(memswusage_str, NULL, 10);
3485
237e200e
SH
3486 memswlimit = memswlimit / 1024;
3487 memswusage = memswusage / 1024;
3488 }
3489
3490 memusage = strtoul(memusage_str, NULL, 10);
3491 memlimit /= 1024;
3492 memusage /= 1024;
3493
c6095b08
SH
3494 parse_memstat(memstat_str, &cached, &active_anon,
3495 &inactive_anon, &active_file, &inactive_file,
559eaa8f 3496 &unevictable, &shmem);
237e200e
SH
3497
3498 f = fopen("/proc/meminfo", "r");
3499 if (!f)
3500 goto err;
3501
3502 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3503 ssize_t l;
237e200e
SH
3504 char *printme, lbuf[100];
3505
3506 memset(lbuf, 0, 100);
3507 if (startswith(line, "MemTotal:")) {
594a10e6 3508 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3509 if (hosttotal < memlimit)
3510 memlimit = hosttotal;
3511 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3512 printme = lbuf;
3513 } else if (startswith(line, "MemFree:")) {
3514 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3515 printme = lbuf;
3516 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3517 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3518 printme = lbuf;
3519 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3520 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3521 if (hostswtotal < memswlimit)
3522 memswlimit = hostswtotal;
3523 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3524 printme = lbuf;
3525 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3526 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3527 swapusage = memswusage - memusage,
3528 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3529 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3530 printme = lbuf;
da35d72a
SH
3531 } else if (startswith(line, "Slab:")) {
3532 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3533 printme = lbuf;
237e200e
SH
3534 } else if (startswith(line, "Buffers:")) {
3535 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3536 printme = lbuf;
3537 } else if (startswith(line, "Cached:")) {
3538 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3539 printme = lbuf;
3540 } else if (startswith(line, "SwapCached:")) {
3541 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3542 printme = lbuf;
2f306ad3 3543 } else if (startswith(line, "Active:")) {
c6095b08
SH
3544 snprintf(lbuf, 100, "Active: %8lu kB\n",
3545 active_anon + active_file);
3546 printme = lbuf;
2f306ad3 3547 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3548 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3549 inactive_anon + inactive_file);
3550 printme = lbuf;
3551 } else if (startswith(line, "Active(anon)")) {
3552 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3553 printme = lbuf;
3554 } else if (startswith(line, "Inactive(anon)")) {
3555 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3556 printme = lbuf;
3557 } else if (startswith(line, "Active(file)")) {
3558 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3559 printme = lbuf;
3560 } else if (startswith(line, "Inactive(file)")) {
3561 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3562 printme = lbuf;
3563 } else if (startswith(line, "Unevictable")) {
3564 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3565 printme = lbuf;
3566 } else if (startswith(line, "SReclaimable")) {
3567 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3568 printme = lbuf;
3569 } else if (startswith(line, "SUnreclaim")) {
3570 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3571 printme = lbuf;
559eaa8f
JS
3572 } else if (startswith(line, "Shmem:")) {
3573 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3574 printme = lbuf;
28cdea9b
JS
3575 } else if (startswith(line, "ShmemHugePages")) {
3576 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3577 printme = lbuf;
3578 } else if (startswith(line, "ShmemPmdMapped")) {
3579 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3580 printme = lbuf;
237e200e
SH
3581 } else
3582 printme = line;
3583
3584 l = snprintf(cache, cache_size, "%s", printme);
3585 if (l < 0) {
3586 perror("Error writing to cache");
3587 rv = 0;
3588 goto err;
3589
3590 }
3591 if (l >= cache_size) {
b8defc3d 3592 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3593 rv = 0;
3594 goto err;
3595 }
3596
3597 cache += l;
3598 cache_size -= l;
3599 total_len += l;
3600 }
3601
3602 d->cached = 1;
3603 d->size = total_len;
3604 if (total_len > size ) total_len = size;
3605 memcpy(buf, d->buf, total_len);
3606
3607 rv = total_len;
3608err:
3609 if (f)
3610 fclose(f);
3611 free(line);
3612 free(cg);
3613 free(memusage_str);
3614 free(memswlimit_str);
3615 free(memswusage_str);
3616 free(memstat_str);
237e200e
SH
3617 return rv;
3618}
3619
3620/*
3621 * Read the cpuset.cpus for cg
3622 * Return the answer in a newly allocated string which must be freed
3623 */
3624static char *get_cpuset(const char *cg)
3625{
3626 char *answer;
3627
3628 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3629 return NULL;
3630 return answer;
3631}
3632
3633bool cpu_in_cpuset(int cpu, const char *cpuset);
3634
3635static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3636{
3637 int cpu;
3638
3639 if (sscanf(line, "processor : %d", &cpu) != 1)
3640 return false;
3641 return cpu_in_cpuset(cpu, cpuset);
3642}
3643
c59d6a55
JS
3644/*
3645 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3646 * depending on `param`. Parameter value is returned throuh `value`.
3647 */
3648static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3649{
3650 bool rv = false;
3651 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3652 char *str = NULL;
3653
3654 sprintf(file, "cpu.cfs_%s_us", param);
3655
3656 if (!cgfs_get_value("cpu", cg, file, &str))
3657 goto err;
3658
3659 if (sscanf(str, "%ld", value) != 1)
3660 goto err;
3661
3662 rv = true;
3663
3664err:
3665 if (str)
3666 free(str);
3667 return rv;
3668}
3669
3670/*
3671 * Return the maximum number of visible CPUs based on CPU quotas.
3672 * If there is no quota set, zero is returned.
3673 */
3674int max_cpu_count(const char *cg)
3675{
3676 int rv, nprocs;
3677 int64_t cfs_quota, cfs_period;
3678
3679 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3680 return 0;
3681
3682 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3683 return 0;
3684
3685 if (cfs_quota <= 0 || cfs_period <= 0)
3686 return 0;
3687
3688 rv = cfs_quota / cfs_period;
3689
3690 /* In case quota/period does not yield a whole number, add one CPU for
3691 * the remainder.
3692 */
3693 if ((cfs_quota % cfs_period) > 0)
3694 rv += 1;
3695
3696 nprocs = get_nprocs();
3697
3698 if (rv > nprocs)
3699 rv = nprocs;
3700
3701 return rv;
3702}
3703
3704/*
3705 * Determine whether CPU views should be used or not.
3706 */
3707bool use_cpuview(const char *cg)
3708{
3709 int cfd;
3710 char *tmpc;
3711
3712 tmpc = find_mounted_controller("cpu", &cfd);
3713 if (!tmpc)
3714 return false;
3715
3716 tmpc = find_mounted_controller("cpuacct", &cfd);
3717 if (!tmpc)
3718 return false;
3719
3720 return true;
3721}
3722
237e200e
SH
3723/*
3724 * check whether this is a '^processor" line in /proc/cpuinfo
3725 */
3726static bool is_processor_line(const char *line)
3727{
3728 int cpu;
3729
3730 if (sscanf(line, "processor : %d", &cpu) == 1)
3731 return true;
3732 return false;
3733}
3734
3735static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3736 struct fuse_file_info *fi)
3737{
3738 struct fuse_context *fc = fuse_get_context();
3739 struct file_info *d = (struct file_info *)fi->fh;
3740 char *cg;
3741 char *cpuset = NULL;
3742 char *line = NULL;
3743 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79 3744 bool am_printing = false, firstline = true, is_s390x = false;
c59d6a55
JS
3745 int curcpu = -1, cpu, max_cpus = 0;
3746 bool use_view;
237e200e
SH
3747 char *cache = d->buf;
3748 size_t cache_size = d->buflen;
3749 FILE *f = NULL;
3750
3751 if (offset){
3752 if (offset > d->size)
3753 return -EINVAL;
3754 if (!d->cached)
3755 return 0;
3756 int left = d->size - offset;
3757 total_len = left > size ? size: left;
3758 memcpy(buf, cache + offset, total_len);
3759 return total_len;
3760 }
3761
3762 pid_t initpid = lookup_initpid_in_store(fc->pid);
3763 if (initpid <= 0)
3764 initpid = fc->pid;
3765 cg = get_pid_cgroup(initpid, "cpuset");
3766 if (!cg)
3767 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3768 prune_init_slice(cg);
237e200e
SH
3769
3770 cpuset = get_cpuset(cg);
3771 if (!cpuset)
3772 goto err;
3773
c59d6a55
JS
3774 use_view = use_cpuview(cg);
3775
3776 if (use_view)
3777 max_cpus = max_cpu_count(cg);
3778
237e200e
SH
3779 f = fopen("/proc/cpuinfo", "r");
3780 if (!f)
3781 goto err;
3782
3783 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3784 ssize_t l;
f676eb79
SH
3785 if (firstline) {
3786 firstline = false;
3787 if (strstr(line, "IBM/S390") != NULL) {
3788 is_s390x = true;
3789 am_printing = true;
5ed9d4e2 3790 continue;
f676eb79
SH
3791 }
3792 }
5ed9d4e2
SH
3793 if (strncmp(line, "# processors:", 12) == 0)
3794 continue;
237e200e 3795 if (is_processor_line(line)) {
c59d6a55
JS
3796 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3797 break;
237e200e
SH
3798 am_printing = cpuline_in_cpuset(line, cpuset);
3799 if (am_printing) {
3800 curcpu ++;
3801 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3802 if (l < 0) {
3803 perror("Error writing to cache");
3804 rv = 0;
3805 goto err;
3806 }
3807 if (l >= cache_size) {
b8defc3d 3808 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3809 rv = 0;
3810 goto err;
3811 }
3812 cache += l;
3813 cache_size -= l;
3814 total_len += l;
3815 }
3816 continue;
f676eb79
SH
3817 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3818 char *p;
c59d6a55
JS
3819 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3820 break;
f676eb79
SH
3821 if (!cpu_in_cpuset(cpu, cpuset))
3822 continue;
3823 curcpu ++;
3824 p = strchr(line, ':');
3825 if (!p || !*p)
3826 goto err;
3827 p++;
5ed9d4e2 3828 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3829 if (l < 0) {
3830 perror("Error writing to cache");
3831 rv = 0;
3832 goto err;
3833 }
3834 if (l >= cache_size) {
b8defc3d 3835 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3836 rv = 0;
3837 goto err;
3838 }
3839 cache += l;
3840 cache_size -= l;
3841 total_len += l;
3842 continue;
3843
237e200e
SH
3844 }
3845 if (am_printing) {
3846 l = snprintf(cache, cache_size, "%s", line);
3847 if (l < 0) {
3848 perror("Error writing to cache");
3849 rv = 0;
3850 goto err;
3851 }
3852 if (l >= cache_size) {
b8defc3d 3853 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3854 rv = 0;
3855 goto err;
3856 }
3857 cache += l;
3858 cache_size -= l;
3859 total_len += l;
3860 }
3861 }
3862
5ed9d4e2
SH
3863 if (is_s390x) {
3864 char *origcache = d->buf;
a262ddb7 3865 ssize_t l;
5ed9d4e2
SH
3866 do {
3867 d->buf = malloc(d->buflen);
3868 } while (!d->buf);
3869 cache = d->buf;
3870 cache_size = d->buflen;
3871 total_len = 0;
3872 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3873 if (l < 0 || l >= cache_size) {
3874 free(origcache);
3875 goto err;
3876 }
3877 cache_size -= l;
3878 cache += l;
3879 total_len += l;
3880 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3881 if (l < 0 || l >= cache_size) {
3882 free(origcache);
3883 goto err;
3884 }
3885 cache_size -= l;
3886 cache += l;
3887 total_len += l;
3888 l = snprintf(cache, cache_size, "%s", origcache);
3889 free(origcache);
3890 if (l < 0 || l >= cache_size)
3891 goto err;
3892 total_len += l;
3893 }
3894
237e200e
SH
3895 d->cached = 1;
3896 d->size = total_len;
3897 if (total_len > size ) total_len = size;
3898
3899 /* read from off 0 */
3900 memcpy(buf, d->buf, total_len);
3901 rv = total_len;
3902err:
3903 if (f)
3904 fclose(f);
3905 free(line);
3906 free(cpuset);
3907 free(cg);
3908 return rv;
3909}
3910
0ecddf02 3911static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3912{
9ac264cf 3913 int ret;
0ecddf02
CB
3914 FILE *f;
3915 uint64_t starttime;
3916 /* strlen("/proc/") = 6
3917 * +
3918 * LXCFS_NUMSTRLEN64
3919 * +
3920 * strlen("/stat") = 5
3921 * +
3922 * \0 = 1
3923 * */
3924#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3925 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3926 pid_t qpid;
3927
3928 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3929 if (qpid <= 0) {
3930 /* Caller can check for EINVAL on 0. */
3931 errno = EINVAL;
9ac264cf 3932 return 0;
0ecddf02 3933 }
9ac264cf 3934
0ecddf02
CB
3935 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3936 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3937 /* Caller can check for EINVAL on 0. */
3938 errno = EINVAL;
9ac264cf 3939 return 0;
0ecddf02 3940 }
9ac264cf 3941
0ecddf02
CB
3942 f = fopen(path, "r");
3943 if (!f) {
3944 /* Caller can check for EINVAL on 0. */
3945 errno = EINVAL;
9ac264cf 3946 return 0;
0ecddf02 3947 }
9ac264cf 3948
0ecddf02
CB
3949 /* Note that the *scanf() argument supression requires that length
3950 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3951 * at us. It's like telling someone you're not married and then asking
3952 * if you can bring your wife to the party.
3953 */
3954 ret = fscanf(f, "%*d " /* (1) pid %d */
3955 "%*s " /* (2) comm %s */
3956 "%*c " /* (3) state %c */
3957 "%*d " /* (4) ppid %d */
3958 "%*d " /* (5) pgrp %d */
3959 "%*d " /* (6) session %d */
3960 "%*d " /* (7) tty_nr %d */
3961 "%*d " /* (8) tpgid %d */
3962 "%*u " /* (9) flags %u */
3963 "%*u " /* (10) minflt %lu */
3964 "%*u " /* (11) cminflt %lu */
3965 "%*u " /* (12) majflt %lu */
3966 "%*u " /* (13) cmajflt %lu */
3967 "%*u " /* (14) utime %lu */
3968 "%*u " /* (15) stime %lu */
3969 "%*d " /* (16) cutime %ld */
3970 "%*d " /* (17) cstime %ld */
3971 "%*d " /* (18) priority %ld */
3972 "%*d " /* (19) nice %ld */
3973 "%*d " /* (20) num_threads %ld */
3974 "%*d " /* (21) itrealvalue %ld */
3975 "%" PRIu64, /* (22) starttime %llu */
3976 &starttime);
3977 if (ret != 1) {
3978 fclose(f);
3979 /* Caller can check for EINVAL on 0. */
3980 errno = EINVAL;
3981 return 0;
3982 }
3983
3984 fclose(f);
3985
3986 errno = 0;
3987 return starttime;
3988}
3989
3990static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3991{
3992 uint64_t clockticks;
3993 int64_t ticks_per_sec;
3994
3995 clockticks = get_reaper_start_time(pid);
3996 if (clockticks == 0 && errno == EINVAL) {
3997 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3998 return 0;
3999 }
4000
4001 ticks_per_sec = sysconf(_SC_CLK_TCK);
4002 if (ticks_per_sec < 0 && errno == EINVAL) {
4003 lxcfs_debug(
4004 "%s\n",
4005 "failed to determine number of clock ticks in a second");
4006 return 0;
4007 }
4008
4009 return (clockticks /= ticks_per_sec);
4010}
4011
4012static uint64_t get_reaper_age(pid_t pid)
4013{
4014 uint64_t procstart, uptime, procage;
4015
4016 /* We need to substract the time the process has started since system
4017 * boot minus the time when the system has started to get the actual
4018 * reaper age.
4019 */
4020 procstart = get_reaper_start_time_in_sec(pid);
4021 procage = procstart;
4022 if (procstart > 0) {
4023 int ret;
4024 struct timespec spec;
4025
4026 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4027 if (ret < 0)
4028 return 0;
4029 /* We could make this more precise here by using the tv_nsec
4030 * field in the timespec struct and convert it to milliseconds
4031 * and then create a double for the seconds and milliseconds but
4032 * that seems more work than it is worth.
4033 */
4034 uptime = spec.tv_sec;
4035 procage = uptime - procstart;
4036 }
4037
4038 return procage;
4039}
4040
8be92dd1
JS
4041/*
4042 * Returns 0 on success.
4043 * It is the caller's responsibility to free `return_usage`, unless this
4044 * function returns an error.
4045 */
4046static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
4047{
4048 int cpucount = get_nprocs();
4049 struct cpuacct_usage *cpu_usage;
4050 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4051 int cg_cpu;
4052 uint64_t cg_user, cg_system;
4053 int64_t ticks_per_sec;
4054 char *usage_str = NULL;
4055
4056 ticks_per_sec = sysconf(_SC_CLK_TCK);
4057
4058 if (ticks_per_sec < 0 && errno == EINVAL) {
4059 lxcfs_debug(
4060 "%s\n",
4061 "read_cpuacct_usage_all failed to determine number of clock ticks "
4062 "in a second");
4063 return -1;
4064 }
4065
4066 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4067 if (!cpu_usage)
4068 return -ENOMEM;
4069
4070 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4071 rv = -1;
4072 goto err;
4073 }
4074
4075 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4076 lxcfs_error("read_cpuacct_usage_all reading first line from "
4077 "%s/cpuacct.usage_all failed.\n", cg);
4078 rv = -1;
4079 goto err;
4080 }
4081
4082 read_pos += read_cnt;
4083
4084 for (i = 0, j = 0; i < cpucount; i++) {
4085 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4086 &cg_system, &read_cnt);
4087
4088 if (ret == EOF)
4089 break;
4090
4091 if (ret != 3) {
4092 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4093 "failed.\n", cg);
4094 rv = -1;
4095 goto err;
4096 }
4097
4098 read_pos += read_cnt;
4099
4100 if (!cpu_in_cpuset(i, cpuset))
4101 continue;
4102
4103 /* Convert the time from nanoseconds to USER_HZ */
4104 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4105 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4106 j++;
4107 }
4108
4109 rv = 0;
4110 *return_usage = cpu_usage;
4111
4112err:
4113 if (usage_str)
4114 free(usage_str);
4115
4116 if (rv != 0) {
4117 free(cpu_usage);
4118 *return_usage = NULL;
4119 }
4120
4121 return rv;
4122}
4123
056adcef
JS
4124static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4125{
4126 int i;
4127 unsigned long sum = 0;
4128
4129 for (i = 0; i < cpu_count; i++) {
4130 /* When cpuset is changed on the fly, the CPUs might get reordered.
4131 * We could either reset all counters, or check that the substractions
4132 * below will return expected results.
4133 */
4134 if (newer[i].user > older[i].user)
4135 diff[i].user = newer[i].user - older[i].user;
4136 else
4137 diff[i].user = 0;
4138
4139 if (newer[i].system > older[i].system)
4140 diff[i].system = newer[i].system - older[i].system;
4141 else
4142 diff[i].system = 0;
4143
4144 if (newer[i].idle > older[i].idle)
4145 diff[i].idle = newer[i].idle - older[i].idle;
4146 else
4147 diff[i].idle = 0;
4148
4149 sum += diff[i].user;
4150 sum += diff[i].system;
4151 sum += diff[i].idle;
4152 }
4153
4154 return sum;
4155}
4156
4157static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4158{
4159 unsigned long free_space, to_add;
4160
4161 free_space = threshold - usage->user - usage->system;
4162
4163 if (free_space > usage->idle)
4164 free_space = usage->idle;
4165
4166 to_add = free_space > *surplus ? *surplus : free_space;
4167
4168 *counter += to_add;
4169 usage->idle -= to_add;
4170 *surplus -= to_add;
4171}
4172
951acc94
JS
4173static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4174{
4175 struct cg_proc_stat *first = NULL, *prev, *tmp;
4176
4177 for (prev = NULL; node; ) {
4178 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4179 tmp = node;
4180 lxcfs_debug("Removing stat node for %s\n", node->cg);
4181
4182 if (prev)
4183 prev->next = node->next;
4184 else
4185 first = node->next;
4186
4187 node = node->next;
4188 free_proc_stat_node(tmp);
4189 } else {
4190 if (!first)
4191 first = node;
4192 prev = node;
4193 node = node->next;
4194 }
4195 }
4196
4197 return first;
4198}
4199
4200#define PROC_STAT_PRUNE_INTERVAL 10
4201static void prune_proc_stat_history(void)
4202{
4203 int i;
4204 time_t now = time(NULL);
4205
4206 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
2f49b662
JS
4207 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4208
4209 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4210 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94 4211 return;
2f49b662 4212 }
951acc94 4213
2f49b662
JS
4214 if (proc_stat_history[i]->next) {
4215 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4216 proc_stat_history[i]->lastcheck = now;
4217 }
951acc94 4218
2f49b662 4219 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94
JS
4220 }
4221}
4222
2f49b662 4223static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
056adcef 4224{
056adcef
JS
4225 struct cg_proc_stat *node;
4226
2f49b662
JS
4227 pthread_rwlock_rdlock(&head->lock);
4228
4229 if (!head->next) {
4230 pthread_rwlock_unlock(&head->lock);
056adcef 4231 return NULL;
2f49b662 4232 }
056adcef
JS
4233
4234 node = head->next;
4235
4236 do {
4237 if (strcmp(cg, node->cg) == 0)
951acc94 4238 goto out;
056adcef
JS
4239 } while ((node = node->next));
4240
951acc94
JS
4241 node = NULL;
4242
4243out:
2f49b662 4244 pthread_rwlock_unlock(&head->lock);
951acc94
JS
4245 prune_proc_stat_history();
4246 return node;
056adcef
JS
4247}
4248
4249static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4250{
4251 struct cg_proc_stat *node;
4252 int i;
4253
4254 node = malloc(sizeof(struct cg_proc_stat));
4255 if (!node)
4256 goto err;
4257
4258 node->cg = NULL;
4259 node->usage = NULL;
4260 node->view = NULL;
4261
4262 node->cg = malloc(strlen(cg) + 1);
4263 if (!node->cg)
4264 goto err;
4265
4266 strcpy(node->cg, cg);
4267
4268 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4269 if (!node->usage)
4270 goto err;
4271
4272 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4273
4274 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4275 if (!node->view)
4276 goto err;
4277
4278 node->cpu_count = cpu_count;
4279 node->next = NULL;
4280
2f49b662
JS
4281 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4282 lxcfs_error("%s\n", "Failed to initialize node lock");
4283 goto err;
4284 }
4285
056adcef
JS
4286 for (i = 0; i < cpu_count; i++) {
4287 node->view[i].user = 0;
4288 node->view[i].system = 0;
4289 node->view[i].idle = 0;
4290 }
4291
4292 return node;
4293
4294err:
4295 if (node && node->cg)
4296 free(node->cg);
4297 if (node && node->usage)
4298 free(node->usage);
4299 if (node && node->view)
4300 free(node->view);
4301 if (node)
4302 free(node);
4303
4304 return NULL;
4305}
4306
2f49b662 4307static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
056adcef
JS
4308{
4309 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4310 struct cg_proc_stat_head *head = proc_stat_history[hash];
2f49b662
JS
4311 struct cg_proc_stat *node, *rv = new_node;
4312
4313 pthread_rwlock_wrlock(&head->lock);
056adcef
JS
4314
4315 if (!head->next) {
4316 head->next = new_node;
2f49b662 4317 goto out;
056adcef
JS
4318 }
4319
2f49b662
JS
4320 node = head->next;
4321
056adcef 4322 for (;;) {
2f49b662
JS
4323 if (strcmp(node->cg, new_node->cg) == 0) {
4324 /* The node is already present, return it */
4325 free_proc_stat_node(new_node);
4326 rv = node;
4327 goto out;
4328 }
056adcef
JS
4329
4330 if (node->next) {
4331 node = node->next;
4332 continue;
4333 }
4334
4335 node->next = new_node;
2f49b662
JS
4336 goto out;
4337 }
4338
4339out:
4340 pthread_rwlock_unlock(&head->lock);
4341 return rv;
4342}
4343
895f28e5
JS
4344static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4345{
4346 struct cpuacct_usage *new_usage, *new_view;
4347 int i;
4348
4349 /* Allocate new memory */
4350 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4351 if (!new_usage)
4352 return false;
4353
4354 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4355 if (!new_view) {
4356 free(new_usage);
4357 return false;
4358 }
4359
4360 /* Copy existing data & initialize new elements */
4361 for (i = 0; i < cpu_count; i++) {
4362 if (i < node->cpu_count) {
4363 new_usage[i].user = node->usage[i].user;
4364 new_usage[i].system = node->usage[i].system;
4365 new_usage[i].idle = node->usage[i].idle;
4366
4367 new_view[i].user = node->view[i].user;
4368 new_view[i].system = node->view[i].system;
4369 new_view[i].idle = node->view[i].idle;
4370 } else {
4371 new_usage[i].user = 0;
4372 new_usage[i].system = 0;
4373 new_usage[i].idle = 0;
4374
4375 new_view[i].user = 0;
4376 new_view[i].system = 0;
4377 new_view[i].idle = 0;
4378 }
4379 }
4380
4381 free(node->usage);
4382 free(node->view);
4383
4384 node->usage = new_usage;
4385 node->view = new_view;
4386 node->cpu_count = cpu_count;
4387
4388 return true;
4389}
4390
2f49b662
JS
4391static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4392{
4393 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4394 struct cg_proc_stat_head *head = proc_stat_history[hash];
4395 struct cg_proc_stat *node;
4396
4397 node = find_proc_stat_node(head, cg);
4398
4399 if (!node) {
4400 node = new_proc_stat_node(usage, cpu_count, cg);
4401 if (!node)
4402 return NULL;
4403
4404 node = add_proc_stat_node(node);
4405 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
056adcef 4406 }
2f49b662
JS
4407
4408 pthread_mutex_lock(&node->lock);
895f28e5
JS
4409
4410 /* If additional CPUs on the host have been enabled, CPU usage counter
4411 * arrays have to be expanded */
4412 if (node->cpu_count < cpu_count) {
4413 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4414 node->cpu_count, cpu_count, cg);
4415
4416 if (!expand_proc_stat_node(node, cpu_count)) {
4417 pthread_mutex_unlock(&node->lock);
4418 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4419 node->cpu_count, cpu_count, cg);
4420 return NULL;
4421 }
4422 }
4423
2f49b662 4424 return node;
056adcef
JS
4425}
4426
4427static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4428{
4429 int i;
4430
4431 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4432 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4433
4434 for (i = 0; i < cpu_count; i++) {
4435 node->view[i].user = 0;
4436 node->view[i].system = 0;
4437 node->view[i].idle = 0;
4438 }
4439
4440 node->cpu_count = cpu_count;
4441}
4442
4443static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size)
4444{
4445 char *line = NULL;
4446 size_t linelen = 0, total_len = 0, rv = 0, l;
4447 int curcpu = -1; /* cpu numbering starts at 0 */
4448 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4449 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4450 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4451 unsigned long user_surplus = 0, system_surplus = 0;
4452 unsigned long total_sum, threshold;
4453 struct cg_proc_stat *stat_node;
4454 struct cpuacct_usage *diff = NULL;
4455 int nprocs = get_nprocs();
4456
4457 /* Read all CPU stats and stop when we've encountered other lines */
4458 while (getline(&line, &linelen, f) != -1) {
4459 int cpu, ret;
4460 char cpu_char[10]; /* That's a lot of cores */
4461 uint64_t all_used, cg_used;
4462
4463 if (strlen(line) == 0)
4464 continue;
4465 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4466 /* not a ^cpuN line containing a number N */
4467 break;
4468 }
4469
4470 if (sscanf(cpu_char, "%d", &cpu) != 1)
4471 continue;
4472 if (!cpu_in_cpuset(cpu, cpuset))
4473 continue;
4474 curcpu ++;
4475 cpu_cnt ++;
4476
4477 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4478 &user,
4479 &nice,
4480 &system,
4481 &idle,
4482 &iowait,
4483 &irq,
4484 &softirq,
4485 &steal,
4486 &guest,
4487 &guest_nice);
4488
4489 if (ret != 10)
4490 continue;
4491
4492 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4493 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4494
4495 if (all_used >= cg_used) {
4496 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4497
4498 } else {
4499 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4500 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4501 curcpu, cg, all_used, cg_used);
4502 cg_cpu_usage[curcpu].idle = idle;
4503 }
4504 }
4505
4506 /* Cannot use more CPUs than is available due to cpuset */
4507 if (max_cpus > cpu_cnt)
4508 max_cpus = cpu_cnt;
4509
2f49b662 4510 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
056adcef
JS
4511
4512 if (!stat_node) {
2f49b662
JS
4513 lxcfs_error("unable to find/create stat node for %s\n", cg);
4514 rv = 0;
4515 goto err;
056adcef
JS
4516 }
4517
4518 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4519 if (!diff) {
4520 rv = 0;
4521 goto err;
4522 }
4523
4524 /*
4525 * If the new values are LOWER than values stored in memory, it means
4526 * the cgroup has been reset/recreated and we should reset too.
4527 */
4528 if (cg_cpu_usage[0].user < stat_node->usage[0].user)
4529 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4530
4531 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, cpu_cnt);
4532
4533 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4534 stat_node->usage[curcpu].user += diff[curcpu].user;
4535 stat_node->usage[curcpu].system += diff[curcpu].system;
4536 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4537
4538 if (max_cpus > 0 && curcpu >= max_cpus) {
4539 user_surplus += diff[curcpu].user;
4540 system_surplus += diff[curcpu].system;
4541 }
4542 }
4543
4544 /* Calculate usage counters of visible CPUs */
4545 if (max_cpus > 0) {
4546 /* threshold = maximum usage per cpu, including idle */
4547 threshold = total_sum / cpu_cnt * max_cpus;
4548
4549 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4550 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4551 continue;
4552
4553 /* Add user */
4554 add_cpu_usage(
4555 &user_surplus,
4556 &diff[curcpu],
4557 &diff[curcpu].user,
4558 threshold);
4559
4560 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4561 continue;
4562
4563 /* If there is still room, add system */
4564 add_cpu_usage(
4565 &system_surplus,
4566 &diff[curcpu],
4567 &diff[curcpu].system,
4568 threshold);
4569 }
4570
4571 if (user_surplus > 0)
4572 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4573 if (system_surplus > 0)
4574 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4575
4576 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4577 stat_node->view[curcpu].user += diff[curcpu].user;
4578 stat_node->view[curcpu].system += diff[curcpu].system;
4579 stat_node->view[curcpu].idle += diff[curcpu].idle;
4580
4581 user_sum += stat_node->view[curcpu].user;
4582 system_sum += stat_node->view[curcpu].system;
4583 idle_sum += stat_node->view[curcpu].idle;
4584 }
4585
4586 } else {
4587 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4588 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4589 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4590 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4591
4592 user_sum += stat_node->view[curcpu].user;
4593 system_sum += stat_node->view[curcpu].system;
4594 idle_sum += stat_node->view[curcpu].idle;
4595 }
4596 }
4597
4598 /* Render the file */
4599 /* cpu-all */
4600 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4601 user_sum,
4602 system_sum,
4603 idle_sum);
4604
4605 if (l < 0) {
4606 perror("Error writing to cache");
4607 rv = 0;
4608 goto err;
4609
4610 }
4611 if (l >= buf_size) {
4612 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4613 rv = 0;
4614 goto err;
4615 }
4616
4617 buf += l;
4618 buf_size -= l;
4619 total_len += l;
4620
4621 /* Render visible CPUs */
4622 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4623 if (max_cpus > 0 && curcpu == max_cpus)
4624 break;
4625
4626 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4627 curcpu,
4628 stat_node->view[curcpu].user,
4629 stat_node->view[curcpu].system,
4630 stat_node->view[curcpu].idle);
4631
4632 if (l < 0) {
4633 perror("Error writing to cache");
4634 rv = 0;
4635 goto err;
4636
4637 }
4638 if (l >= buf_size) {
4639 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4640 rv = 0;
4641 goto err;
4642 }
4643
4644 buf += l;
4645 buf_size -= l;
4646 total_len += l;
4647 }
4648
4649 /* Pass the rest of /proc/stat, start with the last line read */
4650 l = snprintf(buf, buf_size, "%s", line);
4651
4652 if (l < 0) {
4653 perror("Error writing to cache");
4654 rv = 0;
4655 goto err;
4656
4657 }
4658 if (l >= buf_size) {
4659 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4660 rv = 0;
4661 goto err;
4662 }
4663
4664 buf += l;
4665 buf_size -= l;
4666 total_len += l;
4667
4668 /* Pass the rest of the host's /proc/stat */
4669 while (getline(&line, &linelen, f) != -1) {
4670 l = snprintf(buf, buf_size, "%s", line);
4671 if (l < 0) {
4672 perror("Error writing to cache");
4673 rv = 0;
4674 goto err;
4675 }
4676 if (l >= buf_size) {
4677 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678 rv = 0;
4679 goto err;
4680 }
4681 buf += l;
4682 buf_size -= l;
4683 total_len += l;
4684 }
4685
4686 rv = total_len;
4687
4688err:
2f49b662
JS
4689 if (stat_node)
4690 pthread_mutex_unlock(&stat_node->lock);
056adcef
JS
4691 if (line)
4692 free(line);
4693 if (diff)
4694 free(diff);
4695 return rv;
4696}
4697
f34de69a 4698#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
4699static int proc_stat_read(char *buf, size_t size, off_t offset,
4700 struct fuse_file_info *fi)
4701{
4702 struct fuse_context *fc = fuse_get_context();
4703 struct file_info *d = (struct file_info *)fi->fh;
4704 char *cg;
4705 char *cpuset = NULL;
4706 char *line = NULL;
4707 size_t linelen = 0, total_len = 0, rv = 0;
4708 int curcpu = -1; /* cpu numbering starts at 0 */
7144f069 4709 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 4710 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 4711 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
4712 char cpuall[CPUALL_MAX_SIZE];
4713 /* reserve for cpu all */
4714 char *cache = d->buf + CPUALL_MAX_SIZE;
4715 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4716 FILE *f = NULL;
8be92dd1 4717 struct cpuacct_usage *cg_cpu_usage = NULL;
237e200e
SH
4718
4719 if (offset){
4720 if (offset > d->size)
4721 return -EINVAL;
4722 if (!d->cached)
4723 return 0;
4724 int left = d->size - offset;
4725 total_len = left > size ? size: left;
4726 memcpy(buf, d->buf + offset, total_len);
4727 return total_len;
4728 }
4729
4730 pid_t initpid = lookup_initpid_in_store(fc->pid);
4731 if (initpid <= 0)
4732 initpid = fc->pid;
4733 cg = get_pid_cgroup(initpid, "cpuset");
4734 if (!cg)
4735 return read_file("/proc/stat", buf, size, d);
6d2f6996 4736 prune_init_slice(cg);
237e200e
SH
4737
4738 cpuset = get_cpuset(cg);
4739 if (!cpuset)
4740 goto err;
4741
8be92dd1
JS
4742 /*
4743 * Read cpuacct.usage_all for all CPUs.
4744 * If the cpuacct cgroup is present, it is used to calculate the container's
4745 * CPU usage. If not, values from the host's /proc/stat are used.
4746 */
4747 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
4748 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4749 "falling back to the host's /proc/stat");
4750 }
4751
237e200e
SH
4752 f = fopen("/proc/stat", "r");
4753 if (!f)
4754 goto err;
4755
4756 //skip first line
4757 if (getline(&line, &linelen, f) < 0) {
b8defc3d 4758 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
4759 goto err;
4760 }
4761
056adcef
JS
4762 if (use_cpuview(cg) && cg_cpu_usage) {
4763 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen);
4764 goto out;
4765 }
4766
237e200e 4767 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4768 ssize_t l;
237e200e
SH
4769 int cpu;
4770 char cpu_char[10]; /* That's a lot of cores */
4771 char *c;
8be92dd1
JS
4772 uint64_t all_used, cg_used, new_idle;
4773 int ret;
237e200e 4774
b4665ce0
SH
4775 if (strlen(line) == 0)
4776 continue;
237e200e
SH
4777 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4778 /* not a ^cpuN line containing a number N, just print it */
9502bae2 4779 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
4780 if (l < 0) {
4781 perror("Error writing to cache");
4782 rv = 0;
4783 goto err;
4784 }
4785 if (l >= cache_size) {
b8defc3d 4786 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4787 rv = 0;
4788 goto err;
4789 }
4790 cache += l;
4791 cache_size -= l;
4792 total_len += l;
4793 continue;
4794 }
4795
4796 if (sscanf(cpu_char, "%d", &cpu) != 1)
4797 continue;
4798 if (!cpu_in_cpuset(cpu, cpuset))
4799 continue;
4800 curcpu ++;
4801
8be92dd1 4802 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
7144f069
CB
4803 &user,
4804 &nice,
4805 &system,
4806 &idle,
4807 &iowait,
4808 &irq,
4809 &softirq,
4810 &steal,
4811 &guest,
8be92dd1
JS
4812 &guest_nice);
4813
4814 if (ret != 10 || !cg_cpu_usage) {
4815 c = strchr(line, ' ');
4816 if (!c)
4817 continue;
4818 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4819 if (l < 0) {
4820 perror("Error writing to cache");
4821 rv = 0;
4822 goto err;
4823
4824 }
4825 if (l >= cache_size) {
4826 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4827 rv = 0;
4828 goto err;
4829 }
4830
4831 cache += l;
4832 cache_size -= l;
4833 total_len += l;
4834
4835 if (ret != 10)
4836 continue;
4837 }
4838
4839 if (cg_cpu_usage) {
4840 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4841 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4842
4843 if (all_used >= cg_used) {
4844 new_idle = idle + (all_used - cg_used);
4845
4846 } else {
4847 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4848 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4849 curcpu, cg, all_used, cg_used);
4850 new_idle = idle;
4851 }
4852
4853 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4854 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4855 new_idle);
4856
4857 if (l < 0) {
4858 perror("Error writing to cache");
4859 rv = 0;
4860 goto err;
4861
4862 }
4863 if (l >= cache_size) {
4864 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4865 rv = 0;
4866 goto err;
4867 }
4868
4869 cache += l;
4870 cache_size -= l;
4871 total_len += l;
4872
4873 user_sum += cg_cpu_usage[curcpu].user;
4874 system_sum += cg_cpu_usage[curcpu].system;
4875 idle_sum += new_idle;
4876
4877 } else {
4878 user_sum += user;
4879 nice_sum += nice;
4880 system_sum += system;
4881 idle_sum += idle;
4882 iowait_sum += iowait;
4883 irq_sum += irq;
4884 softirq_sum += softirq;
4885 steal_sum += steal;
4886 guest_sum += guest;
4887 guest_nice_sum += guest_nice;
4888 }
237e200e
SH
4889 }
4890
4891 cache = d->buf;
4892
7144f069
CB
4893 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4894 user_sum,
4895 nice_sum,
4896 system_sum,
4897 idle_sum,
4898 iowait_sum,
4899 irq_sum,
4900 softirq_sum,
4901 steal_sum,
4902 guest_sum,
4903 guest_nice_sum);
4904 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
4905 memcpy(cache, cpuall, cpuall_len);
4906 cache += cpuall_len;
7144f069 4907 } else {
237e200e 4908 /* shouldn't happen */
b8defc3d 4909 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
4910 cpuall_len = 0;
4911 }
4912
4913 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4914 total_len += cpuall_len;
056adcef
JS
4915
4916out:
237e200e
SH
4917 d->cached = 1;
4918 d->size = total_len;
7144f069
CB
4919 if (total_len > size)
4920 total_len = size;
237e200e
SH
4921
4922 memcpy(buf, d->buf, total_len);
4923 rv = total_len;
4924
4925err:
4926 if (f)
4927 fclose(f);
8be92dd1
JS
4928 if (cg_cpu_usage)
4929 free(cg_cpu_usage);
237e200e
SH
4930 free(line);
4931 free(cpuset);
4932 free(cg);
4933 return rv;
4934}
4935
0ecddf02
CB
4936/* This function retrieves the busy time of a group of tasks by looking at
4937 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4938 * been given it's own cpuacct cgroup. If not, this function will take the busy
4939 * time of all other taks that do not actually belong to the container into
4940 * account as well. If someone has a clever solution for this please send a
4941 * patch!
4942 */
237e200e
SH
4943static unsigned long get_reaper_busy(pid_t task)
4944{
4945 pid_t initpid = lookup_initpid_in_store(task);
4946 char *cgroup = NULL, *usage_str = NULL;
4947 unsigned long usage = 0;
4948
4949 if (initpid <= 0)
4950 return 0;
4951
4952 cgroup = get_pid_cgroup(initpid, "cpuacct");
4953 if (!cgroup)
4954 goto out;
6d2f6996 4955 prune_init_slice(cgroup);
237e200e
SH
4956 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4957 goto out;
4958 usage = strtoul(usage_str, NULL, 10);
4959 usage /= 1000000000;
4960
4961out:
4962 free(cgroup);
4963 free(usage_str);
4964 return usage;
4965}
4966
4967#if RELOADTEST
4968void iwashere(void)
4969{
237e200e
SH
4970 int fd;
4971
ec2b5e7c 4972 fd = creat("/tmp/lxcfs-iwashere", 0644);
237e200e
SH
4973 if (fd >= 0)
4974 close(fd);
4975}
4976#endif
4977
4978/*
4979 * We read /proc/uptime and reuse its second field.
4980 * For the first field, we use the mtime for the reaper for
4981 * the calling pid as returned by getreaperage
4982 */
4983static int proc_uptime_read(char *buf, size_t size, off_t offset,
4984 struct fuse_file_info *fi)
4985{
4986 struct fuse_context *fc = fuse_get_context();
4987 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 4988 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 4989 char *cache = d->buf;
a262ddb7 4990 ssize_t total_len = 0;
0ecddf02 4991 uint64_t idletime, reaperage;
237e200e
SH
4992
4993#if RELOADTEST
4994 iwashere();
4995#endif
4996
4997 if (offset){
237e200e
SH
4998 if (!d->cached)
4999 return 0;
bbdf646b
BM
5000 if (offset > d->size)
5001 return -EINVAL;
237e200e
SH
5002 int left = d->size - offset;
5003 total_len = left > size ? size: left;
5004 memcpy(buf, cache + offset, total_len);
5005 return total_len;
5006 }
5007
0ecddf02
CB
5008 reaperage = get_reaper_age(fc->pid);
5009 /* To understand why this is done, please read the comment to the
5010 * get_reaper_busy() function.
5011 */
5012 idletime = reaperage;
5013 if (reaperage >= busytime)
5014 idletime = reaperage - busytime;
237e200e 5015
bbdf646b
BM
5016 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5017 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 5018 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
5019 return 0;
5020 }
5021
5022 d->size = (int)total_len;
5023 d->cached = 1;
5024
5025 if (total_len > size) total_len = size;
5026
5027 memcpy(buf, d->buf, total_len);
5028 return total_len;
5029}
5030
5031static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5032 struct fuse_file_info *fi)
5033{
5034 char dev_name[72];
5035 struct fuse_context *fc = fuse_get_context();
5036 struct file_info *d = (struct file_info *)fi->fh;
5037 char *cg;
5038 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5039 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5040 unsigned long read = 0, write = 0;
5041 unsigned long read_merged = 0, write_merged = 0;
5042 unsigned long read_sectors = 0, write_sectors = 0;
5043 unsigned long read_ticks = 0, write_ticks = 0;
5044 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5045 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5046 char *cache = d->buf;
5047 size_t cache_size = d->buflen;
5048 char *line = NULL;
5049 size_t linelen = 0, total_len = 0, rv = 0;
5050 unsigned int major = 0, minor = 0;
5051 int i = 0;
5052 FILE *f = NULL;
5053
5054 if (offset){
5055 if (offset > d->size)
5056 return -EINVAL;
5057 if (!d->cached)
5058 return 0;
5059 int left = d->size - offset;
5060 total_len = left > size ? size: left;
5061 memcpy(buf, cache + offset, total_len);
5062 return total_len;
5063 }
5064
5065 pid_t initpid = lookup_initpid_in_store(fc->pid);
5066 if (initpid <= 0)
5067 initpid = fc->pid;
5068 cg = get_pid_cgroup(initpid, "blkio");
5069 if (!cg)
5070 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 5071 prune_init_slice(cg);
237e200e 5072
2209fe50 5073 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 5074 goto err;
2209fe50 5075 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 5076 goto err;
2209fe50 5077 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 5078 goto err;
2209fe50 5079 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 5080 goto err;
2209fe50 5081 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
5082 goto err;
5083
5084
5085 f = fopen("/proc/diskstats", "r");
5086 if (!f)
5087 goto err;
5088
5089 while (getline(&line, &linelen, f) != -1) {
a262ddb7 5090 ssize_t l;
2209fe50 5091 char lbuf[256];
237e200e
SH
5092
5093 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 5094 if (i != 3)
237e200e 5095 continue;
2209fe50
SH
5096
5097 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5098 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5099 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5100 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5101 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5102 read_sectors = read_sectors/512;
5103 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5104 write_sectors = write_sectors/512;
5105
5106 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5107 rd_svctm = rd_svctm/1000000;
5108 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5109 rd_wait = rd_wait/1000000;
5110 read_ticks = rd_svctm + rd_wait;
5111
5112 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5113 wr_svctm = wr_svctm/1000000;
5114 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5115 wr_wait = wr_wait/1000000;
5116 write_ticks = wr_svctm + wr_wait;
5117
5118 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5119 tot_ticks = tot_ticks/1000000;
237e200e
SH
5120
5121 memset(lbuf, 0, 256);
2db31eb6
SH
5122 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5123 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5124 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5125 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5126 else
5127 continue;
237e200e 5128
2209fe50 5129 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
5130 if (l < 0) {
5131 perror("Error writing to fuse buf");
5132 rv = 0;
5133 goto err;
5134 }
5135 if (l >= cache_size) {
b8defc3d 5136 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
5137 rv = 0;
5138 goto err;
5139 }
5140 cache += l;
5141 cache_size -= l;
5142 total_len += l;
5143 }
5144
5145 d->cached = 1;
5146 d->size = total_len;
5147 if (total_len > size ) total_len = size;
5148 memcpy(buf, d->buf, total_len);
5149
5150 rv = total_len;
5151err:
5152 free(cg);
5153 if (f)
5154 fclose(f);
5155 free(line);
5156 free(io_serviced_str);
5157 free(io_merged_str);
5158 free(io_service_bytes_str);
5159 free(io_wait_time_str);
5160 free(io_service_time_str);
5161 return rv;
5162}
5163
70dcc12e
SH
5164static int proc_swaps_read(char *buf, size_t size, off_t offset,
5165 struct fuse_file_info *fi)
5166{
5167 struct fuse_context *fc = fuse_get_context();
5168 struct file_info *d = (struct file_info *)fi->fh;
5169 char *cg = NULL;
018246ff 5170 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 5171 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
5172 ssize_t total_len = 0, rv = 0;
5173 ssize_t l = 0;
70dcc12e
SH
5174 char *cache = d->buf;
5175
5176 if (offset) {
5177 if (offset > d->size)
5178 return -EINVAL;
5179 if (!d->cached)
5180 return 0;
5181 int left = d->size - offset;
5182 total_len = left > size ? size: left;
5183 memcpy(buf, cache + offset, total_len);
5184 return total_len;
5185 }
5186
5187 pid_t initpid = lookup_initpid_in_store(fc->pid);
5188 if (initpid <= 0)
5189 initpid = fc->pid;
5190 cg = get_pid_cgroup(initpid, "memory");
5191 if (!cg)
5192 return read_file("/proc/swaps", buf, size, d);
6d2f6996 5193 prune_init_slice(cg);
70dcc12e 5194
018246ff 5195 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
5196
5197 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5198 goto err;
5199
70dcc12e
SH
5200 memusage = strtoul(memusage_str, NULL, 10);
5201
5202 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5203 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5204
018246ff 5205 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
5206 memswusage = strtoul(memswusage_str, NULL, 10);
5207
70dcc12e
SH
5208 swap_total = (memswlimit - memlimit) / 1024;
5209 swap_free = (memswusage - memusage) / 1024;
5210 }
5211
5212 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5213
5214 /* When no mem + swap limit is specified or swapaccount=0*/
5215 if (!memswlimit) {
5216 char *line = NULL;
5217 size_t linelen = 0;
5218 FILE *f = fopen("/proc/meminfo", "r");
5219
5220 if (!f)
5221 goto err;
5222
5223 while (getline(&line, &linelen, f) != -1) {
5224 if (startswith(line, "SwapTotal:")) {
5225 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5226 } else if (startswith(line, "SwapFree:")) {
5227 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5228 }
5229 }
5230
5231 free(line);
5232 fclose(f);
5233 }
5234
5235 if (swap_total > 0) {
a262ddb7
CB
5236 l = snprintf(d->buf + total_len, d->size - total_len,
5237 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5238 swap_total, swap_free);
5239 total_len += l;
70dcc12e
SH
5240 }
5241
a262ddb7 5242 if (total_len < 0 || l < 0) {
70dcc12e
SH
5243 perror("Error writing to cache");
5244 rv = 0;
5245 goto err;
5246 }
5247
5248 d->cached = 1;
5249 d->size = (int)total_len;
5250
5251 if (total_len > size) total_len = size;
5252 memcpy(buf, d->buf, total_len);
5253 rv = total_len;
5254
5255err:
5256 free(cg);
5257 free(memswlimit_str);
5258 free(memlimit_str);
5259 free(memusage_str);
5260 free(memswusage_str);
70dcc12e
SH
5261 return rv;
5262}
6db4f7a3 5263/*
5264 * Find the process pid from cgroup path.
5265 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5266 * @pid_buf : put pid to pid_buf.
5267 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5268 * @depth : the depth of cgroup in container.
5269 * @sum : return the number of pid.
5270 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5271 */
5272static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5273{
5274 DIR *dir;
5275 int fd;
5276 struct dirent *file;
5277 FILE *f = NULL;
5278 size_t linelen = 0;
5279 char *line = NULL;
5280 int pd;
5281 char *path_dir, *path;
5282 char **pid;
5283
5284 /* path = dpath + "/cgroup.procs" + /0 */
5285 do {
5286 path = malloc(strlen(dpath) + 20);
5287 } while (!path);
5288
5289 strcpy(path, dpath);
5290 fd = openat(cfd, path, O_RDONLY);
5291 if (fd < 0)
5292 goto out;
5293
5294 dir = fdopendir(fd);
5295 if (dir == NULL) {
5296 close(fd);
5297 goto out;
5298 }
5299
5300 while (((file = readdir(dir)) != NULL) && depth > 0) {
5301 if (strncmp(file->d_name, ".", 1) == 0)
5302 continue;
5303 if (strncmp(file->d_name, "..", 1) == 0)
5304 continue;
5305 if (file->d_type == DT_DIR) {
5306 /* path + '/' + d_name +/0 */
5307 do {
5308 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5309 } while (!path_dir);
5310 strcpy(path_dir, path);
5311 strcat(path_dir, "/");
5312 strcat(path_dir, file->d_name);
5313 pd = depth - 1;
5314 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5315 free(path_dir);
5316 }
5317 }
5318 closedir(dir);
5319
5320 strcat(path, "/cgroup.procs");
5321 fd = openat(cfd, path, O_RDONLY);
5322 if (fd < 0)
5323 goto out;
5324
5325 f = fdopen(fd, "r");
5326 if (!f) {
5327 close(fd);
5328 goto out;
5329 }
5330
5331 while (getline(&line, &linelen, f) != -1) {
5332 do {
5333 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5334 } while (!pid);
5335 *pid_buf = pid;
5336 do {
5337 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5338 } while (*(*pid_buf + sum) == NULL);
5339 strcpy(*(*pid_buf + sum), line);
5340 sum++;
5341 }
5342 fclose(f);
5343out:
832904c1
JS
5344 if (line)
5345 free(line);
6db4f7a3 5346 free(path);
5347 return sum;
5348}
5349/*
5350 * calc_load calculates the load according to the following formula:
5351 * load1 = load0 * exp + active * (1 - exp)
5352 *
5353 * @load1: the new loadavg.
5354 * @load0: the former loadavg.
5355 * @active: the total number of running pid at this moment.
5356 * @exp: the fixed-point defined in the beginning.
5357 */
5358static unsigned long
5359calc_load(unsigned long load, unsigned long exp, unsigned long active)
5360{
5361 unsigned long newload;
5362
5363 active = active > 0 ? active * FIXED_1 : 0;
5364 newload = load * exp + active * (FIXED_1 - exp);
5365 if (active >= load)
5366 newload += FIXED_1 - 1;
5367
5368 return newload / FIXED_1;
5369}
5370
5371/*
5372 * Return 0 means that container p->cg is closed.
5373 * Return -1 means that error occurred in refresh.
5374 * Positive num equals the total number of pid.
5375 */
5376static int refresh_load(struct load_node *p, char *path)
5377{
5378 FILE *f = NULL;
5379 char **idbuf;
5380 char proc_path[256];
5381 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5382 char *line = NULL;
5383 size_t linelen = 0;
5384 int sum, length;
5385 DIR *dp;
5386 struct dirent *file;
5387
5388 do {
5389 idbuf = malloc(sizeof(char *));
5390 } while (!idbuf);
5391 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5392 /* normal exit */
5393 if (sum == 0)
5394 goto out;
5395
5396 for (i = 0; i < sum; i++) {
5397 /*clean up '\n' */
5398 length = strlen(idbuf[i])-1;
5399 idbuf[i][length] = '\0';
5400 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5401 if (ret < 0 || ret > 255) {
5402 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5403 i = sum;
5404 sum = -1;
5405 goto err_out;
5406 }
5407
5408 dp = opendir(proc_path);
5409 if (!dp) {
5410 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5411 continue;
5412 }
5413 while ((file = readdir(dp)) != NULL) {
5414 if (strncmp(file->d_name, ".", 1) == 0)
5415 continue;
5416 if (strncmp(file->d_name, "..", 1) == 0)
5417 continue;
5418 total_pid++;
5419 /* We make the biggest pid become last_pid.*/
5420 ret = atof(file->d_name);
5421 last_pid = (ret > last_pid) ? ret : last_pid;
5422
5423 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5424 if (ret < 0 || ret > 255) {
5425 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5426 i = sum;
5427 sum = -1;
5428 closedir(dp);
5429 goto err_out;
5430 }
5431 f = fopen(proc_path, "r");
5432 if (f != NULL) {
5433 while (getline(&line, &linelen, f) != -1) {
5434 /* Find State */
5435 if ((line[0] == 'S') && (line[1] == 't'))
5436 break;
5437 }
5438 if ((line[7] == 'R') || (line[7] == 'D'))
5439 run_pid++;
5440 fclose(f);
5441 }
5442 }
5443 closedir(dp);
5444 }
5445 /*Calculate the loadavg.*/
5446 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5447 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5448 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5449 p->run_pid = run_pid;
5450 p->total_pid = total_pid;
5451 p->last_pid = last_pid;
5452
5453 free(line);
beb5024e 5454err_out:
6db4f7a3 5455 for (; i > 0; i--)
5456 free(idbuf[i-1]);
5457out:
5458 free(idbuf);
5459 return sum;
5460}
5461/*
5462 * Traverse the hash table and update it.
5463 */
5464void *load_begin(void *arg)
5465{
5466
5467 char *path = NULL;
5468 int i, sum, length, ret;
5469 struct load_node *f;
5470 int first_node;
5471 clock_t time1, time2;
5472
5473 while (1) {
a83618e2
JS
5474 if (loadavg_stop == 1)
5475 return NULL;
5476
6db4f7a3 5477 time1 = clock();
5478 for (i = 0; i < LOAD_SIZE; i++) {
5479 pthread_mutex_lock(&load_hash[i].lock);
5480 if (load_hash[i].next == NULL) {
5481 pthread_mutex_unlock(&load_hash[i].lock);
5482 continue;
5483 }
5484 f = load_hash[i].next;
5485 first_node = 1;
5486 while (f) {
5487 length = strlen(f->cg) + 2;
5488 do {
5489 /* strlen(f->cg) + '.' or '' + \0 */
5490 path = malloc(length);
5491 } while (!path);
5492
5493 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5494 if (ret < 0 || ret > length - 1) {
5495 /* snprintf failed, ignore the node.*/
5496 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5497 goto out;
5498 }
5499 sum = refresh_load(f, path);
5500 if (sum == 0) {
5501 f = del_node(f, i);
5502 } else {
5503out: f = f->next;
5504 }
5505 free(path);
5506 /* load_hash[i].lock locks only on the first node.*/
5507 if (first_node == 1) {
5508 first_node = 0;
5509 pthread_mutex_unlock(&load_hash[i].lock);
5510 }
5511 }
5512 }
a83618e2
JS
5513
5514 if (loadavg_stop == 1)
5515 return NULL;
5516
6db4f7a3 5517 time2 = clock();
5518 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5519 }
5520}
5521
5522static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5523 struct fuse_file_info *fi)
5524{
5525 struct fuse_context *fc = fuse_get_context();
5526 struct file_info *d = (struct file_info *)fi->fh;
5527 pid_t initpid;
5528 char *cg;
5529 size_t total_len = 0;
5530 char *cache = d->buf;
5531 struct load_node *n;
5532 int hash;
01d88ede 5533 int cfd, rv = 0;
6db4f7a3 5534 unsigned long a, b, c;
5535
5536 if (offset) {
5537 if (offset > d->size)
5538 return -EINVAL;
5539 if (!d->cached)
5540 return 0;
5541 int left = d->size - offset;
5542 total_len = left > size ? size : left;
5543 memcpy(buf, cache + offset, total_len);
5544 return total_len;
5545 }
5546 if (!loadavg)
5547 return read_file("/proc/loadavg", buf, size, d);
5548
5549 initpid = lookup_initpid_in_store(fc->pid);
5550 if (initpid <= 0)
5551 initpid = fc->pid;
5552 cg = get_pid_cgroup(initpid, "cpu");
5553 if (!cg)
5554 return read_file("/proc/loadavg", buf, size, d);
5555
5556 prune_init_slice(cg);
b077527b 5557 hash = calc_hash(cg) % LOAD_SIZE;
6db4f7a3 5558 n = locate_node(cg, hash);
5559
5560 /* First time */
5561 if (n == NULL) {
5562 if (!find_mounted_controller("cpu", &cfd)) {
5563 /*
5564 * In locate_node() above, pthread_rwlock_unlock() isn't used
5565 * because delete is not allowed before read has ended.
5566 */
5567 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
5568 rv = 0;
5569 goto err;
6db4f7a3 5570 }
5571 do {
5572 n = malloc(sizeof(struct load_node));
5573 } while (!n);
5574
5575 do {
5576 n->cg = malloc(strlen(cg)+1);
5577 } while (!n->cg);
5578 strcpy(n->cg, cg);
5579 n->avenrun[0] = 0;
5580 n->avenrun[1] = 0;
5581 n->avenrun[2] = 0;
5582 n->run_pid = 0;
5583 n->total_pid = 1;
5584 n->last_pid = initpid;
5585 n->cfd = cfd;
5586 insert_node(&n, hash);
5587 }
5588 a = n->avenrun[0] + (FIXED_1/200);
5589 b = n->avenrun[1] + (FIXED_1/200);
5590 c = n->avenrun[2] + (FIXED_1/200);
5591 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5592 LOAD_INT(a), LOAD_FRAC(a),
5593 LOAD_INT(b), LOAD_FRAC(b),
5594 LOAD_INT(c), LOAD_FRAC(c),
5595 n->run_pid, n->total_pid, n->last_pid);
5596 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5597 if (total_len < 0 || total_len >= d->buflen) {
5598 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
5599 rv = 0;
5600 goto err;
6db4f7a3 5601 }
5602 d->size = (int)total_len;
5603 d->cached = 1;
5604
5605 if (total_len > size)
5606 total_len = size;
5607 memcpy(buf, d->buf, total_len);
01d88ede
JS
5608 rv = total_len;
5609
5610err:
5611 free(cg);
5612 return rv;
6db4f7a3 5613}
5614/* Return a positive number on success, return 0 on failure.*/
5615pthread_t load_daemon(int load_use)
5616{
5617 int ret;
5618 pthread_t pid;
5619
5620 ret = init_load();
5621 if (ret == -1) {
5622 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5623 return 0;
5624 }
5625 ret = pthread_create(&pid, NULL, load_begin, NULL);
5626 if (ret != 0) {
5627 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5628 load_free();
5629 return 0;
5630 }
5631 /* use loadavg, here loadavg = 1*/
5632 loadavg = load_use;
5633 return pid;
5634}
70dcc12e 5635
a83618e2
JS
5636/* Returns 0 on success. */
5637int stop_load_daemon(pthread_t pid)
5638{
5639 int s;
5640
5641 /* Signal the thread to gracefully stop */
5642 loadavg_stop = 1;
5643
5644 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5645 if (s != 0) {
5646 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5647 return -1;
5648 }
5649
5650 load_free();
5651 loadavg_stop = 0;
5652
5653 return 0;
5654}
5655
237e200e
SH
5656static off_t get_procfile_size(const char *which)
5657{
5658 FILE *f = fopen(which, "r");
5659 char *line = NULL;
5660 size_t len = 0;
5661 ssize_t sz, answer = 0;
5662 if (!f)
5663 return 0;
5664
5665 while ((sz = getline(&line, &len, f)) != -1)
5666 answer += sz;
5667 fclose (f);
5668 free(line);
5669
5670 return answer;
5671}
5672
5673int proc_getattr(const char *path, struct stat *sb)
5674{
5675 struct timespec now;
5676
5677 memset(sb, 0, sizeof(struct stat));
5678 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5679 return -EINVAL;
5680 sb->st_uid = sb->st_gid = 0;
5681 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5682 if (strcmp(path, "/proc") == 0) {
5683 sb->st_mode = S_IFDIR | 00555;
5684 sb->st_nlink = 2;
5685 return 0;
5686 }
5687 if (strcmp(path, "/proc/meminfo") == 0 ||
5688 strcmp(path, "/proc/cpuinfo") == 0 ||
5689 strcmp(path, "/proc/uptime") == 0 ||
5690 strcmp(path, "/proc/stat") == 0 ||
70dcc12e 5691 strcmp(path, "/proc/diskstats") == 0 ||
46be8eed 5692 strcmp(path, "/proc/swaps") == 0 ||
5693 strcmp(path, "/proc/loadavg") == 0) {
237e200e
SH
5694 sb->st_size = 0;
5695 sb->st_mode = S_IFREG | 00444;
5696 sb->st_nlink = 1;
5697 return 0;
5698 }
5699
5700 return -ENOENT;
5701}
5702
5703int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5704 struct fuse_file_info *fi)
5705{
d639f863
CB
5706 if (filler(buf, ".", NULL, 0) != 0 ||
5707 filler(buf, "..", NULL, 0) != 0 ||
5708 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5709 filler(buf, "meminfo", NULL, 0) != 0 ||
5710 filler(buf, "stat", NULL, 0) != 0 ||
5711 filler(buf, "uptime", NULL, 0) != 0 ||
5712 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5713 filler(buf, "swaps", NULL, 0) != 0 ||
5714 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5715 return -EINVAL;
5716 return 0;
5717}
5718
5719int proc_open(const char *path, struct fuse_file_info *fi)
5720{
5721 int type = -1;
5722 struct file_info *info;
5723
5724 if (strcmp(path, "/proc/meminfo") == 0)
5725 type = LXC_TYPE_PROC_MEMINFO;
5726 else if (strcmp(path, "/proc/cpuinfo") == 0)
5727 type = LXC_TYPE_PROC_CPUINFO;
5728 else if (strcmp(path, "/proc/uptime") == 0)
5729 type = LXC_TYPE_PROC_UPTIME;
5730 else if (strcmp(path, "/proc/stat") == 0)
5731 type = LXC_TYPE_PROC_STAT;
5732 else if (strcmp(path, "/proc/diskstats") == 0)
5733 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
5734 else if (strcmp(path, "/proc/swaps") == 0)
5735 type = LXC_TYPE_PROC_SWAPS;
46be8eed 5736 else if (strcmp(path, "/proc/loadavg") == 0)
5737 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
5738 if (type == -1)
5739 return -ENOENT;
5740
5741 info = malloc(sizeof(*info));
5742 if (!info)
5743 return -ENOMEM;
5744
5745 memset(info, 0, sizeof(*info));
5746 info->type = type;
5747
5748 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5749 do {
5750 info->buf = malloc(info->buflen);
5751 } while (!info->buf);
5752 memset(info->buf, 0, info->buflen);
5753 /* set actual size to buffer size */
5754 info->size = info->buflen;
5755
5756 fi->fh = (unsigned long)info;
5757 return 0;
5758}
5759
bddbb106
SH
5760int proc_access(const char *path, int mask)
5761{
e7849aa3
CB
5762 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5763 return 0;
5764
bddbb106
SH
5765 /* these are all read-only */
5766 if ((mask & ~R_OK) != 0)
1b060d0a 5767 return -EACCES;
bddbb106
SH
5768 return 0;
5769}
5770
237e200e
SH
5771int proc_release(const char *path, struct fuse_file_info *fi)
5772{
43215927 5773 do_release_file_info(fi);
237e200e
SH
5774 return 0;
5775}
5776
5777int proc_read(const char *path, char *buf, size_t size, off_t offset,
5778 struct fuse_file_info *fi)
5779{
5780 struct file_info *f = (struct file_info *) fi->fh;
5781
5782 switch (f->type) {
5783 case LXC_TYPE_PROC_MEMINFO:
5784 return proc_meminfo_read(buf, size, offset, fi);
5785 case LXC_TYPE_PROC_CPUINFO:
5786 return proc_cpuinfo_read(buf, size, offset, fi);
5787 case LXC_TYPE_PROC_UPTIME:
5788 return proc_uptime_read(buf, size, offset, fi);
5789 case LXC_TYPE_PROC_STAT:
5790 return proc_stat_read(buf, size, offset, fi);
5791 case LXC_TYPE_PROC_DISKSTATS:
5792 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
5793 case LXC_TYPE_PROC_SWAPS:
5794 return proc_swaps_read(buf, size, offset, fi);
46be8eed 5795 case LXC_TYPE_PROC_LOADAVG:
5796 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
5797 default:
5798 return -EINVAL;
5799 }
5800}
5801
29a73c2f
CB
5802/*
5803 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
5804 */
5805
5806static bool mkdir_p(const char *dir, mode_t mode)
5807{
5808 const char *tmp = dir;
5809 const char *orig = dir;
5810 char *makeme;
5811
5812 do {
5813 dir = tmp + strspn(tmp, "/");
5814 tmp = dir + strcspn(dir, "/");
5815 makeme = strndup(orig, dir - orig);
5816 if (!makeme)
5817 return false;
5818 if (mkdir(makeme, mode) && errno != EEXIST) {
b8defc3d 5819 lxcfs_error("Failed to create directory '%s': %s.\n",
29a73c2f
CB
5820 makeme, strerror(errno));
5821 free(makeme);
5822 return false;
5823 }
5824 free(makeme);
5825 } while(tmp != dir);
5826
5827 return true;
5828}
5829
5830static bool umount_if_mounted(void)
5831{
5832 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5833 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5834 return false;
5835 }
5836 return true;
5837}
5838
2283e240
CB
5839/* __typeof__ should be safe to use with all compilers. */
5840typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5841static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5842{
5843 return (fs->f_type == (fs_type_magic)magic_val);
5844}
5845
0a4dea41
CB
5846/*
5847 * looking at fs/proc_namespace.c, it appears we can
5848 * actually expect the rootfs entry to very specifically contain
5849 * " - rootfs rootfs "
5850 * IIUC, so long as we've chrooted so that rootfs is not our root,
5851 * the rootfs entry should always be skipped in mountinfo contents.
5852 */
5853static bool is_on_ramfs(void)
5854{
5855 FILE *f;
5856 char *p, *p2;
5857 char *line = NULL;
5858 size_t len = 0;
5859 int i;
5860
5861 f = fopen("/proc/self/mountinfo", "r");
5862 if (!f)
5863 return false;
5864
5865 while (getline(&line, &len, f) != -1) {
5866 for (p = line, i = 0; p && i < 4; i++)
5867 p = strchr(p + 1, ' ');
5868 if (!p)
5869 continue;
5870 p2 = strchr(p + 1, ' ');
5871 if (!p2)
5872 continue;
5873 *p2 = '\0';
5874 if (strcmp(p + 1, "/") == 0) {
5875 // this is '/'. is it the ramfs?
5876 p = strchr(p2 + 1, '-');
5877 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5878 free(line);
5879 fclose(f);
5880 return true;
5881 }
5882 }
5883 }
5884 free(line);
5885 fclose(f);
5886 return false;
5887}
5888
cc309f33 5889static int pivot_enter()
0a4dea41 5890{
cc309f33
CB
5891 int ret = -1, oldroot = -1, newroot = -1;
5892
5893 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5894 if (oldroot < 0) {
5895 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5896 return ret;
5897 }
5898
5899 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5900 if (newroot < 0) {
5901 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5902 goto err;
5903 }
5904
5905 /* change into new root fs */
5906 if (fchdir(newroot) < 0) {
5907 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5908 goto err;
5909 }
5910
0a4dea41
CB
5911 /* pivot_root into our new root fs */
5912 if (pivot_root(".", ".") < 0) {
5913 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5914 goto err;
0a4dea41
CB
5915 }
5916
5917 /*
5918 * At this point the old-root is mounted on top of our new-root.
5919 * To unmounted it we must not be chdir'd into it, so escape back
5920 * to the old-root.
5921 */
5922 if (fchdir(oldroot) < 0) {
5923 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5924 goto err;
0a4dea41
CB
5925 }
5926
5927 if (umount2(".", MNT_DETACH) < 0) {
5928 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5929 goto err;
0a4dea41
CB
5930 }
5931
5932 if (fchdir(newroot) < 0) {
5933 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 5934 goto err;
0a4dea41
CB
5935 }
5936
cc309f33
CB
5937 ret = 0;
5938
5939err:
5940 if (oldroot > 0)
5941 close(oldroot);
5942 if (newroot > 0)
5943 close(newroot);
5944
5945 return ret;
0a4dea41
CB
5946}
5947
5948static int chroot_enter()
5949{
5950 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5951 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5952 return -1;
5953 }
5954
5955 if (chroot(".") < 0) {
5956 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5957 return -1;
5958 }
5959
5960 if (chdir("/") < 0) {
5961 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5962 return -1;
5963 }
5964
5965 return 0;
5966}
5967
0232cbac 5968static int permute_and_enter(void)
29a73c2f 5969{
0a4dea41
CB
5970 struct statfs sb;
5971
5972 if (statfs("/", &sb) < 0) {
5973 lxcfs_error("%s\n", "Could not stat / mountpoint.");
cc309f33 5974 return -1;
0a4dea41
CB
5975 }
5976
5977 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5978 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5979 * /proc/1/mountinfo. */
5980 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5981 return chroot_enter();
29a73c2f 5982
cc309f33 5983 if (pivot_enter() < 0) {
0a4dea41 5984 lxcfs_error("%s\n", "Could not perform pivot root.");
cc309f33 5985 return -1;
29a73c2f
CB
5986 }
5987
cc309f33 5988 return 0;
29a73c2f
CB
5989}
5990
5991/* Prepare our new clean root. */
0232cbac 5992static int permute_prepare(void)
29a73c2f
CB
5993{
5994 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 5995 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
5996 return -1;
5997 }
5998
5999 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 6000 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
6001 return -1;
6002 }
6003
6004 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 6005 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
6006 return -1;
6007 }
6008
6009 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 6010 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
6011 return -1;
6012 }
6013
6014 return 0;
6015}
6016
0232cbac
CB
6017/* Calls chroot() on ramfs, pivot_root() in all other cases. */
6018static bool permute_root(void)
29a73c2f
CB
6019{
6020 /* Prepare new root. */
0232cbac 6021 if (permute_prepare() < 0)
29a73c2f
CB
6022 return false;
6023
6024 /* Pivot into new root. */
0232cbac 6025 if (permute_and_enter() < 0)
29a73c2f
CB
6026 return false;
6027
6028 return true;
6029}
6030
a257a8ee
CB
6031static int preserve_mnt_ns(int pid)
6032{
6033 int ret;
6034 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6035 char path[len];
6036
6037 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6038 if (ret < 0 || (size_t)ret >= len)
6039 return -1;
6040
6041 return open(path, O_RDONLY | O_CLOEXEC);
6042}
6043
0a4dea41 6044static bool cgfs_prepare_mounts(void)
29a73c2f
CB
6045{
6046 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 6047 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
6048 return false;
6049 }
480262c9 6050
29a73c2f 6051 if (!umount_if_mounted()) {
b8defc3d 6052 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
6053 return false;
6054 }
6055
6056 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 6057 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
6058 return false;
6059 }
6060
a257a8ee
CB
6061 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6062 if (cgroup_mount_ns_fd < 0) {
6063 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6064 return false;
6065 }
6066
480262c9 6067 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 6068 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
6069 return false;
6070 }
480262c9 6071
29a73c2f 6072 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 6073 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
6074 return false;
6075 }
480262c9 6076
29a73c2f
CB
6077 return true;
6078}
6079
0a4dea41 6080static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
6081{
6082 char *target;
6083 size_t clen, len;
6084 int i, ret;
6085
6086 for (i = 0; i < num_hierarchies; i++) {
6087 char *controller = hierarchies[i];
51c7ca35 6088
29a73c2f
CB
6089 clen = strlen(controller);
6090 len = strlen(BASEDIR) + clen + 2;
6091 target = malloc(len);
6092 if (!target)
6093 return false;
51c7ca35 6094
29a73c2f
CB
6095 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6096 if (ret < 0 || ret >= len) {
6097 free(target);
6098 return false;
6099 }
6100 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6101 free(target);
6102 return false;
6103 }
51c7ca35
CB
6104 if (!strcmp(controller, "unified"))
6105 ret = mount("none", target, "cgroup2", 0, NULL);
6106 else
6107 ret = mount(controller, target, "cgroup", 0, controller);
6108 if (ret < 0) {
6109 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
6110 free(target);
6111 return false;
6112 }
6113
6114 fd_hierarchies[i] = open(target, O_DIRECTORY);
6115 if (fd_hierarchies[i] < 0) {
6116 free(target);
6117 return false;
6118 }
6119 free(target);
6120 }
6121 return true;
6122}
6123
480262c9 6124static bool cgfs_setup_controllers(void)
29a73c2f 6125{
0a4dea41 6126 if (!cgfs_prepare_mounts())
29a73c2f 6127 return false;
29a73c2f 6128
0a4dea41 6129 if (!cgfs_mount_hierarchies()) {
b8defc3d 6130 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
29a73c2f
CB
6131 return false;
6132 }
6133
0232cbac 6134 if (!permute_root())
29a73c2f
CB
6135 return false;
6136
6137 return true;
6138}
6139
6140static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
6141{
6142 FILE *f;
e58dab00
CB
6143 char *cret, *line = NULL;
6144 char cwd[MAXPATHLEN];
237e200e 6145 size_t len = 0;
480262c9 6146 int i, init_ns = -1;
51c7ca35 6147 bool found_unified = false;
237e200e
SH
6148
6149 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 6150 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
6151 return;
6152 }
e58dab00 6153
237e200e 6154 while (getline(&line, &len, f) != -1) {
51c7ca35 6155 char *idx, *p, *p2;
237e200e
SH
6156
6157 p = strchr(line, ':');
6158 if (!p)
6159 goto out;
51c7ca35 6160 idx = line;
237e200e
SH
6161 *(p++) = '\0';
6162
6163 p2 = strrchr(p, ':');
6164 if (!p2)
6165 goto out;
6166 *p2 = '\0';
6167
a67719f6
CB
6168 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6169 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6170 * because it parses out the empty string "" and later on passes
6171 * it to mount(). Let's skip such entries.
6172 */
51c7ca35
CB
6173 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6174 found_unified = true;
6175 p = "unified";
6176 }
a67719f6 6177
237e200e
SH
6178 if (!store_hierarchy(line, p))
6179 goto out;
6180 }
6181
480262c9 6182 /* Preserve initial namespace. */
a257a8ee 6183 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
6184 if (init_ns < 0) {
6185 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 6186 goto out;
b8defc3d 6187 }
480262c9 6188
92c3ee11 6189 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
6190 if (!fd_hierarchies) {
6191 lxcfs_error("%s\n", strerror(errno));
29a73c2f 6192 goto out;
b8defc3d 6193 }
29a73c2f 6194
480262c9
CB
6195 for (i = 0; i < num_hierarchies; i++)
6196 fd_hierarchies[i] = -1;
6197
e58dab00
CB
6198 cret = getcwd(cwd, MAXPATHLEN);
6199 if (!cret)
6200 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6201
480262c9
CB
6202 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6203 * to privately mount lxcfs cgroups. */
b8defc3d
CB
6204 if (!cgfs_setup_controllers()) {
6205 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 6206 goto out;
b8defc3d 6207 }
480262c9 6208
b8defc3d
CB
6209 if (setns(init_ns, 0) < 0) {
6210 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 6211 goto out;
b8defc3d 6212 }
29a73c2f 6213
e58dab00
CB
6214 if (!cret || chdir(cwd) < 0)
6215 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6216
056adcef
JS
6217 if (!init_cpuview()) {
6218 lxcfs_error("%s\n", "failed to init CPU view");
6219 goto out;
6220 }
6221
237e200e
SH
6222 print_subsystems();
6223
6224out:
6225 free(line);
6226 fclose(f);
480262c9
CB
6227 if (init_ns >= 0)
6228 close(init_ns);
237e200e
SH
6229}
6230
6231static void __attribute__((destructor)) free_subsystems(void)
6232{
6233 int i;
6234
b8defc3d
CB
6235 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6236
29a73c2f 6237 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
6238 if (hierarchies[i])
6239 free(hierarchies[i]);
480262c9 6240 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
6241 close(fd_hierarchies[i]);
6242 }
237e200e 6243 free(hierarchies);
480262c9 6244 free(fd_hierarchies);
056adcef 6245 free_cpuview();
a257a8ee
CB
6246
6247 if (cgroup_mount_ns_fd >= 0)
6248 close(cgroup_mount_ns_fd);
237e200e 6249}