]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
cgroups: add get_cgroup_fd() helper
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f 19#include <sched.h>
db1b32f6 20#include <stdarg.h>
29a73c2f 21#include <stdbool.h>
0ecddf02 22#include <stdint.h>
29a73c2f
CB
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <time.h>
27#include <unistd.h>
28#include <wait.h>
d89504c4 29#include <linux/magic.h>
237e200e 30#include <linux/sched.h>
29a73c2f
CB
31#include <sys/epoll.h>
32#include <sys/mman.h>
33#include <sys/mount.h>
237e200e
SH
34#include <sys/param.h>
35#include <sys/socket.h>
29a73c2f 36#include <sys/syscall.h>
0ecddf02 37#include <sys/sysinfo.h>
d89504c4 38#include <sys/vfs.h>
237e200e 39
237e200e 40#include "bindings.h"
5fbea8a6
CB
41#include "cgroups/cgroup.h"
42#include "cgroups/cgroup_utils.h"
c9236032
HY
43#include "memory_utils.h"
44#include "config.h"
237e200e 45
29a73c2f
CB
/* Define pivot_root() if missing from the C library.
 * pivot_root(2) swaps the mount at new_root with the current root and moves
 * the old root under put_old; glibc historically did not export a wrapper.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	/* Raw syscall wrapper when the kernel supports it. */
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* Kernel headers lack the syscall number: report "not implemented". */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
60
/* Per-CPU usage sample for the cpuacct-based /proc/stat virtualization. */
struct cpuacct_usage {
	uint64_t user;   /* time spent in user mode */
	uint64_t system; /* time spent in kernel mode */
	uint64_t idle;   /* synthesized idle time */
	bool online;     /* whether this CPU is visible/online for the container */
};
67
/* The function of hash table. */
#define LOAD_SIZE 100 /*the size of hash_table */
#define FLUSH_TIME 5 /*the flush rate */
#define DEPTH_DIR 3 /*the depth of per cgroup */
/* The function of calculate loadavg. Fixed-point math mirroring the kernel's
 * CALC_LOAD machinery (see include/linux/sched/loadavg.h). */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* Set asynchronously to request the loadavg refresher to stop (sig_atomic_t
 * so it is safe to write from a signal/other context). */
static volatile sig_atomic_t loadavg_stop = 0;
/* ELF-style string hash used to pick a bucket in the loadavg hash table.
 * Returns a non-negative int (top bit masked off). */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int high;

	/* ELFHash algorithm. */
	while (*name) {
		hash = (hash << 4) + *name++;
		high = hash & 0xf0000000;
		if (high)
			hash ^= (high >> 24);
		hash &= ~high;
	}

	return (hash & 0x7fffffff);
}
100
/* One cached loadavg entry for a single cgroup, linked into a hash bucket. */
struct load_node {
	char *cg;  /*cg */
	unsigned long avenrun[3]; /* Load averages (1/5/15 min, fixed-point) */
	unsigned int run_pid;     /* number of running tasks */
	unsigned int total_pid;   /* total number of tasks */
	unsigned int last_pid;    /* most recently seen pid */
	int cfd;  /* The file descriptor of the mounted cgroup */
	struct load_node *next;   /* next node in this bucket */
	struct load_node **pre;   /* address of the pointer that points at us
				   * (enables O(1) unlink in del_node()) */
};
111
/* Head of one hash bucket of load_node entries, with the three locks that
 * coordinate readers, inserters and the refresh/delete path. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node. To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node. To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node. To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first entry of this bucket */
};
133
static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initialize the hash table.
 * Return 0 on success, return -1 on failure.
 */
139static int init_load(void)
140{
141 int i;
142 int ret;
143
144 for (i = 0; i < LOAD_SIZE; i++) {
145 load_hash[i].next = NULL;
146 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
147 if (ret != 0) {
148 lxcfs_error("%s\n", "Failed to initialize lock");
149 goto out3;
150 }
151 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
152 if (ret != 0) {
153 lxcfs_error("%s\n", "Failed to initialize rdlock");
154 goto out2;
155 }
156 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
157 if (ret != 0) {
158 lxcfs_error("%s\n", "Failed to initialize rilock");
159 goto out1;
160 }
161 }
162 return 0;
163out1:
164 pthread_rwlock_destroy(&load_hash[i].rdlock);
165out2:
166 pthread_mutex_destroy(&load_hash[i].lock);
167out3:
168 while (i > 0) {
169 i--;
170 pthread_mutex_destroy(&load_hash[i].lock);
171 pthread_rwlock_destroy(&load_hash[i].rdlock);
172 pthread_rwlock_destroy(&load_hash[i].rilock);
173 }
174 return -1;
175}
176
177static void insert_node(struct load_node **n, int locate)
178{
179 struct load_node *f;
180
181 pthread_mutex_lock(&load_hash[locate].lock);
182 pthread_rwlock_wrlock(&load_hash[locate].rilock);
183 f = load_hash[locate].next;
184 load_hash[locate].next = *n;
185
186 (*n)->pre = &(load_hash[locate].next);
187 if (f)
188 f->pre = &((*n)->next);
189 (*n)->next = f;
190 pthread_mutex_unlock(&load_hash[locate].lock);
191 pthread_rwlock_unlock(&load_hash[locate].rilock);
192}
/*
 * locate_node() finds special node. Not return NULL means success.
 * It should be noted that rdlock isn't unlocked at the end of code
 * because this function is used to read special node. Delete is not
 * allowed before read has ended.
 * unlock rdlock only in proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		/* Empty bucket: release the insert lock and return NULL.
		 * NOTE(review): rdlock stays held on this path too — the
		 * caller is responsible for releasing it; verify every
		 * caller does so even on the NULL return. */
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* Head pointer has been read; inserts may proceed again. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the bucket under rdlock so deletion cannot race with us. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
54a6d46a 217
/* Delete the load_node n and return the next node of it.
 * Unlinks n from its bucket in O(1) via the pre back-pointer, frees its
 * storage, and returns the successor so callers can keep iterating.
 * Takes the bucket's read-vs-delete write lock for the duration. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	/* free_disarm() frees and NULLs the pointer (project helper). */
	free_disarm(n->cg);
	free_disarm(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
236
/* Tear down the whole loadavg hash table: for each bucket, take all three
 * locks, free every node, then unlock and destroy the locks. Called on
 * shutdown after the refresher thread has stopped. */
static void load_free(void)
{
	struct load_node *f, *p;

	for (int i = 0; i < LOAD_SIZE; i++) {
		/* Acquire all three locks so no reader/inserter/refresher
		 * can touch this bucket while it is destroyed. */
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			/* Empty bucket: just unlock and destroy the locks. */
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}

		/* Free every node in the bucket's list. */
		for (f = load_hash[i].next; f;) {
			free_disarm(f->cg);
			p = f->next;
			free_disarm(f);
			f = p;
		}

		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
056adcef
JS
270
/* Data for CPU view */
/* Per-cgroup cached CPU statistics used to fake a container-local
 * /proc/stat; linked into the proc_stat_history hash table. */
struct cg_proc_stat {
	char *cg;                   /* cgroup path this entry belongs to */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;  // Usage stats reported to the container
	int cpu_count;               /* number of entries in usage/view */
	pthread_mutex_t lock;        // For node manipulation
	struct cg_proc_stat *next;   /* next node in the same hash bucket */
};
280
/* Head of one bucket of cg_proc_stat nodes. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next; /* first node of the bucket */
	time_t lastcheck;          /* last time this bucket was pruned */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
290
#define CPUVIEW_HASH_SIZE 100 /* number of buckets in the CPU-view cache */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
293
294static bool cpuview_init_head(struct cg_proc_stat_head **head)
295{
296 *head = malloc(sizeof(struct cg_proc_stat_head));
297 if (!(*head)) {
298 lxcfs_error("%s\n", strerror(errno));
299 return false;
300 }
301
951acc94 302 (*head)->lastcheck = time(NULL);
056adcef 303 (*head)->next = NULL;
2f49b662
JS
304
305 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
306 lxcfs_error("%s\n", "Failed to initialize list lock");
54a6d46a 307 free_disarm(*head);
2f49b662
JS
308 return false;
309 }
310
056adcef
JS
311 return true;
312}
313
314static bool init_cpuview()
315{
316 int i;
317
318 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
319 proc_stat_history[i] = NULL;
320
321 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
322 if (!cpuview_init_head(&proc_stat_history[i]))
323 goto err;
324 }
325
326 return true;
327
328err:
329 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
54a6d46a
CB
330 if (proc_stat_history[i])
331 free_disarm(proc_stat_history[i]);
056adcef
JS
332 }
333
334 return false;
335}
336
951acc94
JS
337static void free_proc_stat_node(struct cg_proc_stat *node)
338{
2f49b662 339 pthread_mutex_destroy(&node->lock);
54a6d46a
CB
340 free_disarm(node->cg);
341 free_disarm(node->usage);
342 free_disarm(node->view);
343 free_disarm(node);
951acc94
JS
344}
345
056adcef
JS
346static void cpuview_free_head(struct cg_proc_stat_head *head)
347{
348 struct cg_proc_stat *node, *tmp;
349
350 if (head->next) {
351 node = head->next;
352
353 for (;;) {
354 tmp = node;
355 node = node->next;
951acc94 356 free_proc_stat_node(tmp);
056adcef
JS
357
358 if (!node)
359 break;
360 }
361 }
362
2f49b662 363 pthread_rwlock_destroy(&head->lock);
54a6d46a 364 free_disarm(head);
056adcef
JS
365}
366
367static void free_cpuview()
368{
369 int i;
370
371 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
372 if (proc_stat_history[i])
373 cpuview_free_head(proc_stat_history[i]);
374 }
375}
376
237e200e
SH
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;     // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // last time this entry was validated (for pruning)
};
398
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Hash table of cached init pids, keyed by pidns inode number. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Protects every access to pidns_hash_table (see store_lock/store_unlock). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock l; a failure here means internal state is corrupt, so abort hard. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
414
/* Global cgroup-layer operations table, set up during initialization. */
struct cgroup_ops *cgroup_ops;
29a73c2f 416
237e200e
SH
/* Unlock l; mirrors lock_mutex() — any failure is fatal. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
426
/* Acquire the pidns_init_store hash-table mutex. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
431
/* Release the pidns_init_store hash-table mutex. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
436
/* Must be called under store_lock */
/* Returns true when cached entry e still describes a live init process:
 * /proc/<initpid> must exist and its ctime must match the cached one
 * (a ctime change means the pid was recycled by another process). */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}
454
455/* Must be called under store_lock */
456static void remove_initpid(struct pidns_init_store *e)
457{
458 struct pidns_init_store *tmp;
459 int h;
460
7dd6560a
CB
461 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
462
237e200e
SH
463 h = HASH(e->ino);
464 if (pidns_hash_table[h] == e) {
465 pidns_hash_table[h] = e->next;
54a6d46a 466 free_disarm(e);
237e200e
SH
467 return;
468 }
469
470 tmp = pidns_hash_table[h];
471 while (tmp) {
472 if (tmp->next == e) {
473 tmp->next = e->next;
54a6d46a 474 free_disarm(e);
237e200e
SH
475 return;
476 }
477 tmp = tmp->next;
478 }
479}
480
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Drop cached init-pid entries that have not been validated recently.
 * Runs at most once per PURGE_SECS; entries older than 2*PURGE_SECS are
 * removed. The first call only records the time and prunes nothing. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	/* Walk every bucket, unlinking entries whose lastcheck is stale. */
	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free_disarm(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
523
524/* Must be called under store_lock */
525static void save_initpid(struct stat *sb, pid_t pid)
526{
527 struct pidns_init_store *e;
528 char fpath[100];
529 struct stat procsb;
530 int h;
531
7dd6560a
CB
532 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
533
237e200e
SH
534 snprintf(fpath, 100, "/proc/%d", pid);
535 if (stat(fpath, &procsb) < 0)
536 return;
537 do {
538 e = malloc(sizeof(*e));
539 } while (!e);
540 e->ino = sb->st_ino;
541 e->initpid = pid;
542 e->ctime = procsb.st_ctime;
543 h = HASH(e->ino);
544 e->next = pidns_hash_table[h];
545 e->lastcheck = time(NULL);
546 pidns_hash_table[h] = e;
547}
548
549/*
550 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
551 * entry for the inode number and creation time. Verify that the init pid
552 * is still valid. If not, remove it. Return the entry if valid, NULL
553 * otherwise.
554 * Must be called under store_lock
555 */
556static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
557{
558 int h = HASH(sb->st_ino);
559 struct pidns_init_store *e = pidns_hash_table[h];
560
561 while (e) {
562 if (e->ino == sb->st_ino) {
563 if (initpid_still_valid(e, sb)) {
564 e->lastcheck = time(NULL);
565 return e;
566 }
567 remove_initpid(e);
568 return NULL;
569 }
570 e = e->next;
571 }
572
573 return NULL;
574}
575
/* Return 1 when `path` (resolved relative to dirfd `fd`) is a directory,
 * 0 otherwise (including stat failure). */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/* BUG FIX: the flags argument to fstatat() was `fd` (a dirfd, e.g.
	 * a large positive number or AT_FDCWD == -100), which is an invalid
	 * flags value and makes fstatat() fail with EINVAL, so every path
	 * was reported as "not a directory". The flags must be 0 here. */
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
584
c9236032
HY
/* Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC and return the fd.
 * Passing NULL or "" for ns opens /proc/<pid>/ns itself, which doubles as
 * a kernel namespace-support probe. Returns -1 with errno set on error
 * (EFBIG when the constructed path would not fit). */
static int preserve_ns(const int pid, const char *ns)
{
	int ret;
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
	char path[__NS_PATH_LEN];
	bool bare = (!ns || strcmp(ns, "") == 0);

	ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
		       bare ? "" : "/", bare ? "" : ns);
	if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
		errno = EFBIG;
		return -1;
	}

	return open(path, O_RDONLY | O_CLOEXEC);
}
606
/**
 * in_same_namespace - Check whether two processes are in the same namespace.
 * @pid1 - PID of the first process.
 * @pid2 - PID of the second process.
 * @ns   - Name of the namespace to check. Must correspond to one of the names
 *         for the namespaces as shown in /proc/<pid/ns/
 *
 * If the two processes are not in the same namespace returns an fd to the
 * namespace of the second process identified by @pid2. If the two processes are
 * in the same namespace returns -EINVAL, -1 if an error occurred.
 */
static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
{
	/* __do_close_prot_errno: project cleanup attribute — closes the fds
	 * on scope exit while preserving errno. */
	__do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
	int ret = -1;
	struct stat ns_st1, ns_st2;

	ns_fd1 = preserve_ns(pid1, ns);
	if (ns_fd1 < 0) {
		/* The kernel does not support this namespace. This is not an
		 * error.
		 */
		if (errno == ENOENT)
			return -EINVAL;

		return -1;
	}

	ns_fd2 = preserve_ns(pid2, ns);
	if (ns_fd2 < 0)
		return -1;

	ret = fstat(ns_fd1, &ns_st1);
	if (ret < 0)
		return -1;

	ret = fstat(ns_fd2, &ns_st2);
	if (ret < 0)
		return -1;

	/* processes are in the same namespace */
	/* Namespace identity is (device, inode) of the ns file. */
	if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
		return -EINVAL;

	/* processes are in different namespaces */
	/* move_fd() transfers ownership out so the cleanup attribute does
	 * not close the returned fd (project helper). */
	return move_fd(ns_fd2);
}
654
/* True when `pid` is 1 and shares our own pid namespace, i.e. lxcfs is
 * running in the same pidns as that init process. */
static bool is_shared_pidns(pid_t pid)
{
	return pid == 1 && in_same_namespace(pid, getpid(), "pid") == -EINVAL;
}
665
/* Write `string` to the already-open descriptor `fd` via stdio and close it.
 * `fnam` is only used for error messages. Takes ownership of fd in every
 * case. Returns true when the full string was written and flushed. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* BUG FIX: on fdopen() failure the descriptor was leaked;
		 * this function owns fd, so close it before bailing out. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
691
237e200e
SH
/* Ownership and mode of one cgroup file, as returned by cgfs_get_key(). */
struct cgfs_files {
	char *name;        /* basename of the cgroup file */
	uint32_t uid, gid; /* owner */
	uint32_t mode;     /* st_mode bits */
};
697
237e200e
SH
/* Debug helper: dump the mount-namespace fd and every cgroup hierarchy
 * (index, fd, controller list) known to cgroup_ops to stderr. */
static void print_subsystems(void)
{
	int i = 0;

	fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		/* __do_free: project cleanup attribute, frees on scope exit. */
		__do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
		fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
	}
}
709
237e200e
SH
/* Write `value` into <controller mount>/<cgroup>/<file>.
 * Returns false when the controller is unknown, the path does not fit, or
 * opening/writing fails. write_string() consumes the opened fd. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
736
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
/* `dirname` is resolved relative to dirfd `fd`. Failures on individual
 * files are logged and skipped; the function is best-effort throughout. */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* BUG FIX: fd1 was leaked when fdopendir() failed; it is only
		 * consumed by fdopendir() on success. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	/* closedir() also closes the underlying fd1. */
	closedir(d);
}
776
/* Create cgroup `cg` under `controller`'s mount, mode 0755, and chown it
 * (directory plus its control files) to uid:gid when non-root.
 * Returns 0 on success, -EINVAL for an unknown controller, -errno on
 * mkdir/chown failure. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}
807
/* Recursively remove cgroup directory `dirname` and everything below it.
 * `fd` is an open fd on the directory itself; `cfd` is the controller mount
 * fd that `dirname` (and all constructed child paths) are relative to.
 * Best-effort: errors on children are logged and skipped. Returns false
 * only when the directory could not be opened, closed, or unlinked. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		/* Child paths are stat'ed relative to the controller mount. */
		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* NOTE(review): the recursive call reuses the parent's `fd`
		 * while `pathname` names the child — verify against callers
		 * whether this is intentional (readdir of the child appears
		 * to rely on the path-relative *at() calls instead). */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	/* NOTE(review): closedir() above already closed dupfd; this second
	 * close() returns EBADF (pre-existing behavior, left untouched). */
	close(dupfd);

	return ret;
}
866
/* Remove cgroup `cg` (and its subtree) from `controller`'s mount.
 * Returns false when the controller is unknown or the directory cannot be
 * opened; otherwise returns recursive_rmdir()'s result. */
bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam;
	bool bret;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}
893
/* chmod the cgroup file/directory `file` under `controller`'s mount to
 * `mode`. Returns false when the controller is unknown or fchmodat fails. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Build a relative path for the *at() call: . + /file + \0 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);

	return fchmodat(cfd, pathname, mode, 0) == 0;
}
914
/* chown the legacy `tasks` and `cgroup.procs` files inside cgroup directory
 * `dirname` (relative to dirfd `fd`). Returns 0 on success, -errno on the
 * first failure. The buffer is sized for the longer "/cgroup.procs" name
 * and reused for both paths. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *fname;

	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);
	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	return 0;
}
930
/* chown cgroup file/directory `file` under `controller`'s mount to uid:gid;
 * for directories, also chown the tasks/cgroup.procs files inside.
 * Returns 0 on success and a negative errno value on failure. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		/* BUG FIX: this returned `false` (i.e. 0), which callers of
		 * an int/-errno function read as *success* even though the
		 * controller was not found. Return -EINVAL like
		 * cgfs_create() does. */
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}
956
/* Open <controller mount>/<cgroup>/cgroup.procs for writing and return it
 * as a stdio stream, or NULL on any failure. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		/* BUG FIX: returned `false` from a FILE*-returning function;
		 * use NULL for type correctness (same bit pattern, honest
		 * type). */
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
980
f366da65
WB
981static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
982 void ***list, size_t typesize,
983 void* (*iterator)(const char*, const char*, const char*))
237e200e 984{
4ea38a4c 985 int cfd, fd, ret;
237e200e 986 size_t len;
5fbea8a6 987 char *cg;
237e200e 988 char pathname[MAXPATHLEN];
f366da65 989 size_t sz = 0, asz = 0;
4ea38a4c 990 struct dirent *dirent;
237e200e 991 DIR *dir;
237e200e 992
d298bba1 993 cfd = get_cgroup_fd(controller);
f366da65 994 *list = NULL;
5fbea8a6 995 if (cfd < 0)
e97c834b 996 return false;
237e200e 997
f5a6d92e 998 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
999 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1000 cg = alloca(len);
075387cd 1001 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
4ea38a4c 1002 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 1003 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
1004 return false;
1005 }
237e200e 1006
4ea38a4c
CB
1007 fd = openat(cfd, cg, O_DIRECTORY);
1008 if (fd < 0)
1009 return false;
1010
1011 dir = fdopendir(fd);
237e200e
SH
1012 if (!dir)
1013 return false;
1014
4ea38a4c 1015 while ((dirent = readdir(dir))) {
237e200e 1016 struct stat mystat;
237e200e 1017
4ea38a4c
CB
1018 if (!strcmp(dirent->d_name, ".") ||
1019 !strcmp(dirent->d_name, ".."))
237e200e
SH
1020 continue;
1021
4ea38a4c
CB
1022 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1023 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 1024 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
1025 continue;
1026 }
1027
4ea38a4c 1028 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1029 if (ret) {
b8defc3d 1030 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1031 continue;
1032 }
f366da65
WB
1033 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1034 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1035 continue;
1036
1037 if (sz+2 >= asz) {
f366da65 1038 void **tmp;
237e200e
SH
1039 asz += BATCH_SIZE;
1040 do {
f366da65 1041 tmp = realloc(*list, asz * typesize);
237e200e
SH
1042 } while (!tmp);
1043 *list = tmp;
1044 }
4ea38a4c 1045 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1046 (*list)[sz+1] = NULL;
1047 sz++;
1048 }
1049 if (closedir(dir) < 0) {
b8defc3d 1050 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1051 return false;
1052 }
1053 return true;
1054}
1055
f366da65
WB
/* cgfs_iterate_cgroup() callback for cgfs_list_children(): duplicate the
 * directory-entry name. Retries until strdup succeeds (file-wide OOM
 * policy); controller/cgroup are unused. */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *dup = NULL;

	while (!dup)
		dup = strdup(dir_entry);

	return dup;
}
1064
/* List the child cgroup directory names of `cgroup` under `controller` into
 * the NULL-terminated string array *list. Thin wrapper over
 * cgfs_iterate_cgroup() with directories=true. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1069
237e200e
SH
/* Free one cgfs_files entry (its name and the struct). NULL is a no-op. */
void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free_disarm(k->name);
	free_disarm(k);
}
1077
/* Free a NULL-terminated array of cgfs_files entries plus the array itself.
 * NULL is a no-op. */
void free_keys(struct cgfs_files **keys)
{
	if (!keys)
		return;

	for (int i = 0; keys[i]; i++)
		free_key(keys[i]);

	free_disarm(keys);
}
1089
951acc94
JS
/* Return true when <controller mount>/<cgroup>/<file> exists (faccessat
 * F_OK); false for unknown controller, oversized path, or missing file. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return (faccessat(cfd, fnam, F_OK, 0) == 0);
}
1111
237e200e
SH
1112struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1113{
4ea38a4c 1114 int ret, cfd;
237e200e 1115 size_t len;
5fbea8a6 1116 char *fnam;
237e200e
SH
1117 struct stat sb;
1118 struct cgfs_files *newkey;
237e200e 1119
d298bba1 1120 cfd = get_cgroup_fd(controller);
5fbea8a6 1121 if (cfd < 0)
237e200e
SH
1122 return false;
1123
1124 if (file && *file == '/')
1125 file++;
1126
06081b29 1127 if (file && strchr(file, '/'))
237e200e
SH
1128 return NULL;
1129
f5a6d92e
CB
1130 /* Make sure we pass a relative path to *at() family of functions.
1131 * . + /cgroup + / + file + \0
1132 */
4ea38a4c 1133 len = strlen(cgroup) + 3;
237e200e
SH
1134 if (file)
1135 len += strlen(file) + 1;
1136 fnam = alloca(len);
075387cd 1137 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
4ea38a4c 1138 file ? "/" : "", file ? file : "");
237e200e 1139
4ea38a4c 1140 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1141 if (ret < 0)
1142 return NULL;
1143
1144 do {
1145 newkey = malloc(sizeof(struct cgfs_files));
1146 } while (!newkey);
1147 if (file)
1148 newkey->name = must_copy_string(file);
06081b29
CB
1149 else if (strrchr(cgroup, '/'))
1150 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1151 else
1152 newkey->name = must_copy_string(cgroup);
1153 newkey->uid = sb.st_uid;
1154 newkey->gid = sb.st_gid;
1155 newkey->mode = sb.st_mode;
1156
1157 return newkey;
1158}
1159
f366da65 1160static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
237e200e 1161{
f366da65
WB
1162 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1163 if (!entry) {
b8defc3d
CB
1164 lxcfs_error("Error getting files under %s:%s\n", controller,
1165 cgroup);
237e200e 1166 }
f366da65
WB
1167 return entry;
1168}
1169
1170bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1171{
1172 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1173}
1174
/* Return true if @f is a child cgroup directory of @cgroup under
 * @controller.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *path;
	struct stat sb;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() call: . + /cgroup + / + f + \0 */
	len = strlen(cgroup) + strlen(f) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1202
1203#define SEND_CREDS_OK 0
1204#define SEND_CREDS_NOTSK 1
1205#define SEND_CREDS_FAIL 2
1206static bool recv_creds(int sock, struct ucred *cred, char *v);
1207static int wait_for_pid(pid_t pid);
1208static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1209static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1210
1211/*
b10bdd6c 1212 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1213 * over a unix sock so we can read the task's reaper's pid in our
1214 * namespace
b10bdd6c
FG
1215 *
1216 * Note: glibc's fork() does not respect pidns, which can lead to failed
1217 * assertions inside glibc (and thus failed forks) if the child's pid in
1218 * the pidns and the parent pid outside are identical. Using clone prevents
1219 * this issue.
237e200e
SH
1220 */
1221static void write_task_init_pid_exit(int sock, pid_t target)
1222{
237e200e
SH
1223 char fnam[100];
1224 pid_t pid;
237e200e 1225 int fd, ret;
b10bdd6c
FG
1226 size_t stack_size = sysconf(_SC_PAGESIZE);
1227 void *stack = alloca(stack_size);
237e200e
SH
1228
1229 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1230 if (ret < 0 || ret >= sizeof(fnam))
1231 _exit(1);
1232
1233 fd = open(fnam, O_RDONLY);
1234 if (fd < 0) {
1235 perror("write_task_init_pid_exit open of ns/pid");
1236 _exit(1);
1237 }
1238 if (setns(fd, 0)) {
1239 perror("write_task_init_pid_exit setns 1");
1240 close(fd);
1241 _exit(1);
1242 }
b10bdd6c 1243 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1244 if (pid < 0)
1245 _exit(1);
1246 if (pid != 0) {
1247 if (!wait_for_pid(pid))
1248 _exit(1);
1249 _exit(0);
1250 }
b10bdd6c
FG
1251}
1252
1253static int send_creds_clone_wrapper(void *arg) {
1254 struct ucred cred;
1255 char v;
1256 int sock = *(int *)arg;
237e200e
SH
1257
1258 /* we are the child */
1259 cred.uid = 0;
1260 cred.gid = 0;
1261 cred.pid = 1;
1262 v = '1';
1263 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1264 return 1;
1265 return 0;
237e200e
SH
1266}
1267
1268static pid_t get_init_pid_for_task(pid_t task)
1269{
1270 int sock[2];
1271 pid_t pid;
1272 pid_t ret = -1;
1273 char v = '0';
1274 struct ucred cred;
1275
1276 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1277 perror("socketpair");
1278 return -1;
1279 }
1280
1281 pid = fork();
1282 if (pid < 0)
1283 goto out;
1284 if (!pid) {
1285 close(sock[1]);
1286 write_task_init_pid_exit(sock[0], task);
1287 _exit(0);
1288 }
1289
1290 if (!recv_creds(sock[1], &cred, &v))
1291 goto out;
1292 ret = cred.pid;
1293
1294out:
1295 close(sock[0]);
1296 close(sock[1]);
1297 if (pid > 0)
1298 wait_for_pid(pid);
1299 return ret;
1300}
1301
71f17cd2 1302pid_t lookup_initpid_in_store(pid_t qpid)
237e200e
SH
1303{
1304 pid_t answer = 0;
1305 struct stat sb;
1306 struct pidns_init_store *e;
1307 char fnam[100];
1308
1309 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1310 store_lock();
1311 if (stat(fnam, &sb) < 0)
1312 goto out;
1313 e = lookup_verify_initpid(&sb);
1314 if (e) {
1315 answer = e->initpid;
1316 goto out;
1317 }
1318 answer = get_init_pid_for_task(qpid);
1319 if (answer > 0)
1320 save_initpid(&sb, answer);
1321
1322out:
1323 /* we prune at end in case we are returning
1324 * the value we were about to return */
1325 prune_initpid_store();
1326 store_unlock();
1327 return answer;
1328}
1329
/*
 * Reap @pid, retrying on EINTR, and report whether it exited cleanly.
 * Returns 0 if the child exited with status 0, -1 otherwise (including
 * invalid pid or waitpid() failure).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w == -1 && errno != EINTR)
			return -1;
		/* EINTR or an unexpected pid: try again. */
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1350
237e200e 1351/*
db1b32f6
SX
1352 * append the given formatted string to *src.
1353 * src: a pointer to a char* in which to append the formatted string.
237e200e
SH
1354 * sz: the number of characters printed so far, minus trailing \0.
1355 * asz: the allocated size so far
db1b32f6
SX
1356 * format: string format. See printf for details.
1357 * ...: varargs. See printf for details.
237e200e 1358 */
db1b32f6 1359static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
237e200e 1360{
db1b32f6
SX
1361 char tmp[BUF_RESERVE_SIZE];
1362 va_list args;
237e200e 1363
db1b32f6
SX
1364 va_start (args, format);
1365 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1366 va_end(args);
237e200e
SH
1367
1368 if (!*src || tmplen + *sz + 1 >= *asz) {
1369 char *tmp;
1370 do {
1371 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1372 } while (!tmp);
1373 *src = tmp;
1374 *asz += BUF_RESERVE_SIZE;
1375 }
bbfd0e33 1376 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1377 *sz += tmplen;
237e200e
SH
1378}
1379
db1b32f6
SX
1380/*
1381 * append pid to *src.
1382 * src: a pointer to a char* in which ot append the pid.
1383 * sz: the number of characters printed so far, minus trailing \0.
1384 * asz: the allocated size so far
1385 * pid: the pid to append
1386 */
1387static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1388{
1389 must_strcat(src, sz, asz, "%d\n", (int)pid);
1390}
1391
237e200e
SH
1392/*
1393 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1394 * valid in the caller's namespace, return the id mapped into
1395 * pid's namespace.
1396 * Returns the mapped id, or -1 on error.
1397 */
1398unsigned int
1399convert_id_to_ns(FILE *idfile, unsigned int in_id)
1400{
1401 unsigned int nsuid, // base id for a range in the idfile's namespace
1402 hostuid, // base id for a range in the caller's namespace
1403 count; // number of ids in this range
1404 char line[400];
1405 int ret;
1406
1407 fseek(idfile, 0L, SEEK_SET);
1408 while (fgets(line, 400, idfile)) {
1409 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1410 if (ret != 3)
1411 continue;
1412 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1413 /*
1414 * uids wrapped around - unexpected as this is a procfile,
1415 * so just bail.
1416 */
b8defc3d 1417 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
237e200e
SH
1418 nsuid, hostuid, count, line);
1419 return -1;
1420 }
1421 if (hostuid <= in_id && hostuid+count > in_id) {
1422 /*
1423 * now since hostuid <= in_id < hostuid+count, and
1424 * hostuid+count and nsuid+count do not wrap around,
1425 * we know that nsuid+(in_id-hostuid) which must be
1426 * less that nsuid+(count) must not wrap around
1427 */
1428 return (in_id - hostuid) + nsuid;
1429 }
1430 }
1431
1432 // no answer found
1433 return -1;
1434}
1435
1436/*
1437 * for is_privileged_over,
1438 * specify whether we require the calling uid to be root in his
1439 * namespace
1440 */
1441#define NS_ROOT_REQD true
1442#define NS_ROOT_OPT false
1443
1444#define PROCLEN 100
1445
1446static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1447{
1448 char fpath[PROCLEN];
1449 int ret;
1450 bool answer = false;
1451 uid_t nsuid;
1452
1453 if (victim == -1 || uid == -1)
1454 return false;
1455
1456 /*
1457 * If the request is one not requiring root in the namespace,
1458 * then having the same uid suffices. (i.e. uid 1000 has write
1459 * access to files owned by uid 1000
1460 */
1461 if (!req_ns_root && uid == victim)
1462 return true;
1463
1464 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1465 if (ret < 0 || ret >= PROCLEN)
1466 return false;
1467 FILE *f = fopen(fpath, "r");
1468 if (!f)
1469 return false;
1470
1471 /* if caller's not root in his namespace, reject */
1472 nsuid = convert_id_to_ns(f, uid);
1473 if (nsuid)
1474 goto out;
1475
1476 /*
1477 * If victim is not mapped into caller's ns, reject.
1478 * XXX I'm not sure this check is needed given that fuse
1479 * will be sending requests where the vfs has converted
1480 */
1481 nsuid = convert_id_to_ns(f, victim);
1482 if (nsuid == -1)
1483 goto out;
1484
1485 answer = true;
1486
1487out:
1488 fclose(f);
1489 return answer;
1490}
1491
/*
 * Check whether the "other" permission bits required for the open mode
 * @req_mode are all present in @fmode (shift @fmode first to test the
 * owner or group bits instead).
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	int acc = req_mode & O_ACCMODE;
	mode_t needed;

	if (acc == O_RDONLY)
		needed = S_IROTH;
	else if (acc == O_WRONLY)
		needed = S_IWOTH;
	else if (acc == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1511
1512
1513/*
1514 * taskcg is a/b/c
1515 * querycg is /a/b/c/d/e
1516 * we return 'd'
1517 */
1518static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1519{
1520 char *start, *end;
1521
1522 if (strlen(taskcg) <= strlen(querycg)) {
b8defc3d 1523 lxcfs_error("%s\n", "I was fed bad input.");
237e200e
SH
1524 return NULL;
1525 }
1526
06081b29 1527 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
237e200e
SH
1528 start = strdup(taskcg + 1);
1529 else
1530 start = strdup(taskcg + strlen(querycg) + 1);
1531 if (!start)
1532 return NULL;
1533 end = strchr(start, '/');
1534 if (end)
1535 *end = '\0';
1536 return start;
1537}
1538
71f17cd2 1539char *get_pid_cgroup(pid_t pid, const char *contrl)
237e200e 1540{
5dd3e6fd 1541 int cfd;
237e200e 1542
d298bba1 1543 cfd = get_cgroup_fd(contrl);
5fbea8a6
CB
1544 if (cfd < 0)
1545 return false;
237e200e 1546
5fbea8a6
CB
1547 if (pure_unified_layout(cgroup_ops))
1548 return cg_unified_get_current_cgroup(pid);
237e200e 1549
5fbea8a6 1550 return cg_legacy_get_current_cgroup(pid, contrl);
237e200e
SH
1551}
1552
1553/*
1554 * check whether a fuse context may access a cgroup dir or file
1555 *
1556 * If file is not null, it is a cgroup file to check under cg.
1557 * If file is null, then we are checking perms on cg itself.
1558 *
1559 * For files we can check the mode of the list_keys result.
1560 * For cgroups, we must make assumptions based on the files under the
1561 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1562 * yet.
1563 */
1564static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1565{
1566 struct cgfs_files *k = NULL;
1567 bool ret = false;
1568
1569 k = cgfs_get_key(contrl, cg, file);
1570 if (!k)
1571 return false;
1572
1573 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1574 if (perms_include(k->mode >> 6, mode)) {
1575 ret = true;
1576 goto out;
1577 }
1578 }
1579 if (fc->gid == k->gid) {
1580 if (perms_include(k->mode >> 3, mode)) {
1581 ret = true;
1582 goto out;
1583 }
1584 }
1585 ret = perms_include(k->mode, mode);
1586
1587out:
1588 free_key(k);
1589 return ret;
1590}
1591
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" from @cg in place; "/init.scope"
 * itself becomes "/". Strings without the suffix are left untouched.
 */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0'; /* "/init.scope" -> "/" */
	else
		suffix[0] = '\0'; /* ".../init.scope" -> "..." */
}
1609
1610/*
1611 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1612 * If pid is in /a, he may act on /a/b, but not on /b.
1613 * if the answer is false and nextcg is not NULL, then *nextcg will point
1614 * to a string containing the next cgroup directory under cg, which must be
1615 * freed by the caller.
1616 */
1617static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1618{
1619 bool answer = false;
1620 char *c2 = get_pid_cgroup(pid, contrl);
1621 char *linecmp;
1622
1623 if (!c2)
1624 return false;
1625 prune_init_slice(c2);
1626
1627 /*
12c31268
CB
1628 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1629 * they pass in a cgroup without leading '/'
1630 *
1631 * The original line here was:
1632 * linecmp = *cg == '/' ? c2 : c2+1;
1633 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1634 * Serge, do you know?
237e200e 1635 */
12c31268
CB
1636 if (*cg == '/' || !strncmp(cg, "./", 2))
1637 linecmp = c2;
1638 else
1639 linecmp = c2 + 1;
237e200e
SH
1640 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1641 if (nextcg) {
1642 *nextcg = get_next_cgroup_dir(linecmp, cg);
1643 }
1644 goto out;
1645 }
1646 answer = true;
1647
1648out:
1649 free(c2);
1650 return answer;
1651}
1652
1653/*
1654 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1655 */
1656static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1657{
1658 bool answer = false;
1659 char *c2, *task_cg;
1660 size_t target_len, task_len;
1661
f7bff426 1662 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
237e200e
SH
1663 return true;
1664
1665 c2 = get_pid_cgroup(pid, contrl);
1666 if (!c2)
1667 return false;
1668 prune_init_slice(c2);
1669
1670 task_cg = c2 + 1;
1671 target_len = strlen(cg);
1672 task_len = strlen(task_cg);
1673 if (task_len == 0) {
1674 /* Task is in the root cg, it can see everything. This case is
1675 * not handled by the strmcps below, since they test for the
1676 * last /, but that is the first / that we've chopped off
1677 * above.
1678 */
1679 answer = true;
1680 goto out;
1681 }
1682 if (strcmp(cg, task_cg) == 0) {
1683 answer = true;
1684 goto out;
1685 }
1686 if (target_len < task_len) {
1687 /* looking up a parent dir */
1688 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1689 answer = true;
1690 goto out;
1691 }
1692 if (target_len > task_len) {
1693 /* looking up a child dir */
1694 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1695 answer = true;
1696 goto out;
1697 }
1698
1699out:
1700 free(c2);
1701 return answer;
1702}
1703
1704/*
1705 * given /cgroup/freezer/a/b, return "freezer".
1706 * the returned char* should NOT be freed.
1707 */
1708static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1709{
1710 const char *p1;
1711 char *contr, *slash;
1712
99142521 1713 if (strlen(path) < 9) {
e254948f 1714 errno = EACCES;
237e200e 1715 return NULL;
99142521
CB
1716 }
1717 if (*(path + 7) != '/') {
1718 errno = EINVAL;
237e200e 1719 return NULL;
99142521 1720 }
3adc421c 1721 p1 = path + 8;
237e200e 1722 contr = strdupa(p1);
99142521
CB
1723 if (!contr) {
1724 errno = ENOMEM;
237e200e 1725 return NULL;
99142521 1726 }
237e200e
SH
1727 slash = strstr(contr, "/");
1728 if (slash)
1729 *slash = '\0';
1730
5fbea8a6
CB
1731 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1732 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1733 return (*h)->__controllers;
237e200e 1734 }
99142521 1735 errno = ENOENT;
237e200e
SH
1736 return NULL;
1737}
1738
1739/*
1740 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1741 * Note that the returned value may include files (keynames) etc
1742 */
1743static const char *find_cgroup_in_path(const char *path)
1744{
1745 const char *p1;
1746
bc70ba9b 1747 if (strlen(path) < 9) {
e254948f 1748 errno = EACCES;
237e200e 1749 return NULL;
bc70ba9b
CB
1750 }
1751 p1 = strstr(path + 8, "/");
1752 if (!p1) {
1753 errno = EINVAL;
237e200e 1754 return NULL;
bc70ba9b
CB
1755 }
1756 errno = 0;
1757 return p1 + 1;
237e200e
SH
1758}
1759
1760/*
1761 * split the last path element from the path in @cg.
1762 * @dir is newly allocated and should be freed, @last not
1763*/
1764static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1765{
1766 char *p;
1767
1768 do {
1769 *dir = strdup(cg);
1770 } while (!*dir);
1771 *last = strrchr(cg, '/');
1772 if (!*last) {
1773 *last = NULL;
1774 return;
1775 }
1776 p = strrchr(*dir, '/');
1777 *p = '\0';
1778}
1779
1780/*
1781 * FUSE ops for /cgroup
1782 */
1783
1784int cg_getattr(const char *path, struct stat *sb)
1785{
1786 struct timespec now;
1787 struct fuse_context *fc = fuse_get_context();
1788 char * cgdir = NULL;
1789 char *last = NULL, *path1, *path2;
1790 struct cgfs_files *k = NULL;
1791 const char *cgroup;
1792 const char *controller = NULL;
1793 int ret = -ENOENT;
1794
1795
5fbea8a6 1796 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
1797 return -EIO;
1798
1799 memset(sb, 0, sizeof(struct stat));
1800
1801 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1802 return -EINVAL;
1803
1804 sb->st_uid = sb->st_gid = 0;
1805 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1806 sb->st_size = 0;
1807
1808 if (strcmp(path, "/cgroup") == 0) {
1809 sb->st_mode = S_IFDIR | 00755;
1810 sb->st_nlink = 2;
1811 return 0;
1812 }
1813
1814 controller = pick_controller_from_path(fc, path);
1815 if (!controller)
2f7036d0 1816 return -errno;
237e200e
SH
1817 cgroup = find_cgroup_in_path(path);
1818 if (!cgroup) {
1819 /* this is just /cgroup/controller, return it as a dir */
1820 sb->st_mode = S_IFDIR | 00755;
1821 sb->st_nlink = 2;
1822 return 0;
1823 }
1824
1825 get_cgdir_and_path(cgroup, &cgdir, &last);
1826
1827 if (!last) {
1828 path1 = "/";
1829 path2 = cgdir;
1830 } else {
1831 path1 = cgdir;
1832 path2 = last;
1833 }
1834
1835 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 1836 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
1837 initpid = fc->pid;
1838 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1839 * Then check that caller's cgroup is under path if last is a child
1840 * cgroup, or cgdir if last is a file */
1841
1842 if (is_child_cgroup(controller, path1, path2)) {
1843 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1844 ret = -ENOENT;
1845 goto out;
1846 }
1847 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1848 /* this is just /cgroup/controller, return it as a dir */
1849 sb->st_mode = S_IFDIR | 00555;
1850 sb->st_nlink = 2;
1851 ret = 0;
1852 goto out;
1853 }
1854 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1855 ret = -EACCES;
1856 goto out;
1857 }
1858
1859 // get uid, gid, from '/tasks' file and make up a mode
1860 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1861 sb->st_mode = S_IFDIR | 00755;
1862 k = cgfs_get_key(controller, cgroup, NULL);
1863 if (!k) {
1864 sb->st_uid = sb->st_gid = 0;
1865 } else {
1866 sb->st_uid = k->uid;
1867 sb->st_gid = k->gid;
1868 }
1869 free_key(k);
1870 sb->st_nlink = 2;
1871 ret = 0;
1872 goto out;
1873 }
1874
1875 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1876 sb->st_mode = S_IFREG | k->mode;
1877 sb->st_nlink = 1;
1878 sb->st_uid = k->uid;
1879 sb->st_gid = k->gid;
1880 sb->st_size = 0;
1881 free_key(k);
1882 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1883 ret = -ENOENT;
1884 goto out;
1885 }
237e200e
SH
1886 ret = 0;
1887 }
1888
1889out:
1890 free(cgdir);
1891 return ret;
1892}
1893
1894int cg_opendir(const char *path, struct fuse_file_info *fi)
1895{
1896 struct fuse_context *fc = fuse_get_context();
1897 const char *cgroup;
1898 struct file_info *dir_info;
1899 char *controller = NULL;
1900
5fbea8a6 1901 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
1902 return -EIO;
1903
1904 if (strcmp(path, "/cgroup") == 0) {
1905 cgroup = NULL;
1906 controller = NULL;
1907 } else {
1908 // return list of keys for the controller, and list of child cgroups
1909 controller = pick_controller_from_path(fc, path);
1910 if (!controller)
2f7036d0 1911 return -errno;
237e200e
SH
1912
1913 cgroup = find_cgroup_in_path(path);
1914 if (!cgroup) {
1915 /* this is just /cgroup/controller, return its contents */
1916 cgroup = "/";
1917 }
1918 }
1919
1920 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 1921 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
1922 initpid = fc->pid;
1923 if (cgroup) {
1924 if (!caller_may_see_dir(initpid, controller, cgroup))
1925 return -ENOENT;
1926 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1927 return -EACCES;
1928 }
1929
1930 /* we'll free this at cg_releasedir */
1931 dir_info = malloc(sizeof(*dir_info));
1932 if (!dir_info)
1933 return -ENOMEM;
1934 dir_info->controller = must_copy_string(controller);
1935 dir_info->cgroup = must_copy_string(cgroup);
1936 dir_info->type = LXC_TYPE_CGDIR;
1937 dir_info->buf = NULL;
1938 dir_info->file = NULL;
1939 dir_info->buflen = 0;
1940
1941 fi->fh = (unsigned long)dir_info;
1942 return 0;
1943}
1944
1945int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1946 struct fuse_file_info *fi)
1947{
1948 struct file_info *d = (struct file_info *)fi->fh;
1949 struct cgfs_files **list = NULL;
1950 int i, ret;
1951 char *nextcg = NULL;
1952 struct fuse_context *fc = fuse_get_context();
1953 char **clist = NULL;
1954
5fbea8a6
CB
1955 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1956 return -EIO;
1957
d639f863
CB
1958 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1959 return -EIO;
1960
237e200e 1961 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 1962 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
1963 return -EIO;
1964 }
1965 if (!d->cgroup && !d->controller) {
5fbea8a6
CB
1966 /*
1967 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1968 * This only works with the legacy hierarchy.
1969 */
1970 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1971 if (is_unified_hierarchy(*h))
1972 continue;
237e200e 1973
5fbea8a6 1974 if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
237e200e 1975 return -EIO;
237e200e 1976 }
5fbea8a6 1977
237e200e
SH
1978 return 0;
1979 }
1980
1981 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1982 // not a valid cgroup
1983 ret = -EINVAL;
1984 goto out;
1985 }
1986
1987 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 1988 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
1989 initpid = fc->pid;
1990 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1991 if (nextcg) {
1992 ret = filler(buf, nextcg, NULL, 0);
1993 free(nextcg);
1994 if (ret != 0) {
1995 ret = -EIO;
1996 goto out;
1997 }
1998 }
1999 ret = 0;
2000 goto out;
2001 }
2002
b737a54a 2003 for (i = 0; list && list[i]; i++) {
237e200e
SH
2004 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2005 ret = -EIO;
2006 goto out;
2007 }
2008 }
2009
2010 // now get the list of child cgroups
2011
2012 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2013 ret = 0;
2014 goto out;
2015 }
f366da65
WB
2016 if (clist) {
2017 for (i = 0; clist[i]; i++) {
2018 if (filler(buf, clist[i], NULL, 0) != 0) {
2019 ret = -EIO;
2020 goto out;
2021 }
237e200e
SH
2022 }
2023 }
2024 ret = 0;
2025
2026out:
2027 free_keys(list);
2028 if (clist) {
2029 for (i = 0; clist[i]; i++)
2030 free(clist[i]);
2031 free(clist);
2032 }
2033 return ret;
2034}
2035
71f17cd2 2036void do_release_file_info(struct fuse_file_info *fi)
237e200e 2037{
43215927
SH
2038 struct file_info *f = (struct file_info *)fi->fh;
2039
237e200e
SH
2040 if (!f)
2041 return;
43215927
SH
2042
2043 fi->fh = 0;
2044
54a6d46a
CB
2045 free_disarm(f->controller);
2046 free_disarm(f->cgroup);
2047 free_disarm(f->file);
2048 free_disarm(f->buf);
2049 free_disarm(f);
237e200e
SH
2050}
2051
/* FUSE releasedir: drop the state allocated in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2057
2058int cg_open(const char *path, struct fuse_file_info *fi)
2059{
2060 const char *cgroup;
2061 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2062 struct cgfs_files *k = NULL;
2063 struct file_info *file_info;
2064 struct fuse_context *fc = fuse_get_context();
2065 int ret;
2066
5fbea8a6 2067 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
2068 return -EIO;
2069
2070 controller = pick_controller_from_path(fc, path);
2071 if (!controller)
2f7036d0 2072 return -errno;
237e200e
SH
2073 cgroup = find_cgroup_in_path(path);
2074 if (!cgroup)
bc70ba9b 2075 return -errno;
237e200e
SH
2076
2077 get_cgdir_and_path(cgroup, &cgdir, &last);
2078 if (!last) {
2079 path1 = "/";
2080 path2 = cgdir;
2081 } else {
2082 path1 = cgdir;
2083 path2 = last;
2084 }
2085
2086 k = cgfs_get_key(controller, path1, path2);
2087 if (!k) {
2088 ret = -EINVAL;
2089 goto out;
2090 }
2091 free_key(k);
2092
2093 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 2094 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
2095 initpid = fc->pid;
2096 if (!caller_may_see_dir(initpid, controller, path1)) {
2097 ret = -ENOENT;
2098 goto out;
2099 }
2100 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2101 ret = -EACCES;
2102 goto out;
2103 }
2104
2105 /* we'll free this at cg_release */
2106 file_info = malloc(sizeof(*file_info));
2107 if (!file_info) {
2108 ret = -ENOMEM;
2109 goto out;
2110 }
2111 file_info->controller = must_copy_string(controller);
2112 file_info->cgroup = must_copy_string(path1);
2113 file_info->file = must_copy_string(path2);
2114 file_info->type = LXC_TYPE_CGFILE;
2115 file_info->buf = NULL;
2116 file_info->buflen = 0;
2117
2118 fi->fh = (unsigned long)file_info;
2119 ret = 0;
2120
2121out:
2122 free(cgdir);
2123 return ret;
2124}
2125
bddbb106
SH
2126int cg_access(const char *path, int mode)
2127{
6f0f6b83 2128 int ret;
bddbb106 2129 const char *cgroup;
6f0f6b83
CB
2130 char *path1, *path2, *controller;
2131 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2132 struct cgfs_files *k = NULL;
2133 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2134
5fbea8a6
CB
2135 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2136 return -EIO;
2137
9873c5e8 2138 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2139 return 0;
bddbb106 2140
bddbb106
SH
2141 controller = pick_controller_from_path(fc, path);
2142 if (!controller)
2f7036d0 2143 return -errno;
bddbb106 2144 cgroup = find_cgroup_in_path(path);
575316c4
SH
2145 if (!cgroup) {
2146 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2147 if ((mode & W_OK) == 0)
2148 return 0;
2149 return -EACCES;
575316c4 2150 }
bddbb106
SH
2151
2152 get_cgdir_and_path(cgroup, &cgdir, &last);
2153 if (!last) {
2154 path1 = "/";
2155 path2 = cgdir;
2156 } else {
2157 path1 = cgdir;
2158 path2 = last;
2159 }
2160
2161 k = cgfs_get_key(controller, path1, path2);
2162 if (!k) {
3f441bc7
SH
2163 if ((mode & W_OK) == 0)
2164 ret = 0;
2165 else
2166 ret = -EACCES;
bddbb106
SH
2167 goto out;
2168 }
2169 free_key(k);
2170
2171 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 2172 if (initpid <= 1 || is_shared_pidns(initpid))
bddbb106
SH
2173 initpid = fc->pid;
2174 if (!caller_may_see_dir(initpid, controller, path1)) {
2175 ret = -ENOENT;
2176 goto out;
2177 }
2178 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2179 ret = -EACCES;
2180 goto out;
2181 }
2182
2183 ret = 0;
2184
2185out:
2186 free(cgdir);
2187 return ret;
2188}
2189
237e200e
SH
/* FUSE release: drop the state allocated in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2195
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait up to @timeout seconds for @sock to become readable or hung up.
 * Returns true when input/hangup is pending, false on timeout (errno
 * set to 0) or error.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) {
		/* timed out */
		errno = 0;
		close(epfd);
		return false;
	}

	ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;

	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2243
/* recv() up to @len bytes from @sockfd, waiting at most 2 seconds for
 * data to arrive. Returns the recv() result, or -1 on timeout.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2250
2251static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2252{
2253 struct msghdr msg = { 0 };
2254 struct iovec iov;
2255 struct cmsghdr *cmsg;
2256 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2257 char buf[1];
2258 buf[0] = 'p';
2259
2260 if (pingfirst) {
2261 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2262 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2263 return SEND_CREDS_FAIL;
2264 }
2265 }
2266
2267 msg.msg_control = cmsgbuf;
2268 msg.msg_controllen = sizeof(cmsgbuf);
2269
2270 cmsg = CMSG_FIRSTHDR(&msg);
2271 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2272 cmsg->cmsg_level = SOL_SOCKET;
2273 cmsg->cmsg_type = SCM_CREDENTIALS;
2274 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2275
2276 msg.msg_name = NULL;
2277 msg.msg_namelen = 0;
2278
2279 buf[0] = v;
2280 iov.iov_base = buf;
2281 iov.iov_len = sizeof(buf);
2282 msg.msg_iov = &iov;
2283 msg.msg_iovlen = 1;
2284
2285 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2286 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2287 if (errno == 3)
2288 return SEND_CREDS_NOTSK;
2289 return SEND_CREDS_FAIL;
2290 }
2291
2292 return SEND_CREDS_OK;
2293}
2294
2295static bool recv_creds(int sock, struct ucred *cred, char *v)
2296{
2297 struct msghdr msg = { 0 };
2298 struct iovec iov;
2299 struct cmsghdr *cmsg;
2300 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2301 char buf[1];
2302 int ret;
2303 int optval = 1;
2304
2305 *v = '1';
2306
2307 cred->pid = -1;
2308 cred->uid = -1;
2309 cred->gid = -1;
2310
2311 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
b8defc3d 2312 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
237e200e
SH
2313 return false;
2314 }
2315 buf[0] = '1';
2316 if (write(sock, buf, 1) != 1) {
b8defc3d 2317 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
237e200e
SH
2318 return false;
2319 }
2320
2321 msg.msg_name = NULL;
2322 msg.msg_namelen = 0;
2323 msg.msg_control = cmsgbuf;
2324 msg.msg_controllen = sizeof(cmsgbuf);
2325
2326 iov.iov_base = buf;
2327 iov.iov_len = sizeof(buf);
2328 msg.msg_iov = &iov;
2329 msg.msg_iovlen = 1;
2330
2331 if (!wait_for_sock(sock, 2)) {
b8defc3d 2332 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
237e200e
SH
2333 return false;
2334 }
2335 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2336 if (ret < 0) {
b8defc3d 2337 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
237e200e
SH
2338 return false;
2339 }
2340
2341 cmsg = CMSG_FIRSTHDR(&msg);
2342
2343 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2344 cmsg->cmsg_level == SOL_SOCKET &&
2345 cmsg->cmsg_type == SCM_CREDENTIALS) {
2346 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2347 }
2348 *v = buf[0];
2349
2350 return true;
2351}
2352
35174b0f
FG
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe the child uses to ACK that it started */
	int sock;	/* socket passed on to the wrapped function */
	pid_t tpid;	/* target pid whose pidns we translate to/from */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2359
2360/*
2361 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2362 * with clone(). This simply writes '1' as ACK back to the parent
2363 * before calling the actual wrapped function.
2364 */
2365static int pid_ns_clone_wrapper(void *arg) {
2366 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2367 char b = '1';
2368
2369 close(args->cpipe[0]);
b8defc3d
CB
2370 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2371 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2372 close(args->cpipe[1]);
2373 return args->wrapped(args->sock, args->tpid);
2374}
237e200e
SH
2375
2376/*
2377 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2378 * int value back over the socket. This shifts the pid from the
2379 * sender's pidns into tpid's pidns.
2380 */
35174b0f 2381static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2382{
2383 char v = '0';
2384 struct ucred cred;
2385
2386 while (recv_creds(sock, &cred, &v)) {
2387 if (v == '1')
35174b0f 2388 return 0;
237e200e 2389 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2390 return 1;
237e200e 2391 }
35174b0f 2392 return 0;
237e200e
SH
2393}
2394
35174b0f 2395
237e200e
SH
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child; never returns, always terminates via _exit()
 * with 0 on success and 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Attach to the target task's pid namespace. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child ACKs that it is running. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* One page of stack for the clone child; it only runs the
	 * small wrapper, so this suffices.
	 */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone(), not fork(): only cloned children enter the pidns. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2452
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * Reads the pid list for @contrl:@cg/@file, translates every pid into
 * @tpid's pid namespace via a helper child, and appends the translated
 * pids to @*d. Returns true on success.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Ship the pid as a credential; the kernel translates it. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* task vanished; skip this pid */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Control byte '1' asks the helper child to exit. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2540
2541int cg_read(const char *path, char *buf, size_t size, off_t offset,
2542 struct fuse_file_info *fi)
2543{
2544 struct fuse_context *fc = fuse_get_context();
2545 struct file_info *f = (struct file_info *)fi->fh;
2546 struct cgfs_files *k = NULL;
2547 char *data = NULL;
2548 int ret, s;
2549 bool r;
2550
5fbea8a6
CB
2551 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2552 return -EIO;
2553
237e200e 2554 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2555 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2556 return -EIO;
2557 }
2558
2559 if (offset)
2560 return 0;
2561
237e200e
SH
2562 if (!f->controller)
2563 return -EINVAL;
2564
2565 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2566 return -EINVAL;
2567 }
2568 free_key(k);
2569
2570
888f8f3c 2571 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2572 ret = -EACCES;
2573 goto out;
2574 }
2575
2576 if (strcmp(f->file, "tasks") == 0 ||
2577 strcmp(f->file, "/tasks") == 0 ||
2578 strcmp(f->file, "/cgroup.procs") == 0 ||
2579 strcmp(f->file, "cgroup.procs") == 0)
2580 // special case - we have to translate the pids
2581 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2582 else
1ca6a467 2583 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
237e200e
SH
2584
2585 if (!r) {
2586 ret = -EINVAL;
2587 goto out;
2588 }
2589
2590 if (!data) {
2591 ret = 0;
2592 goto out;
2593 }
2594 s = strlen(data);
2595 if (s > size)
2596 s = size;
2597 memcpy(buf, data, s);
2598 if (s > 0 && s < size && data[s-1] != '\n')
2599 buf[s++] = '\n';
2600
2601 ret = s;
2602
2603out:
2604 free(data);
2605 return ret;
2606}
2607
35174b0f 2608static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2609{
2610 pid_t vpid;
2611 struct ucred cred;
2612 char v;
2613 int ret;
2614
2615 cred.uid = 0;
2616 cred.gid = 0;
2617 while (1) {
2618 if (!wait_for_sock(sock, 2)) {
b8defc3d 2619 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2620 return 1;
237e200e
SH
2621 }
2622 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2623 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2624 return 1;
237e200e
SH
2625 }
2626 if (vpid == -1) // done
2627 break;
2628 v = '0';
2629 cred.pid = vpid;
2630 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2631 v = '1';
2632 cred.pid = getpid();
2633 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2634 return 1;
237e200e
SH
2635 }
2636 }
35174b0f 2637 return 0;
237e200e
SH
2638}
2639
/*
 * pid_from_ns_wrapper - setns() into @tpid's pid namespace and clone()
 * a child (which is what actually lands inside the pidns) running
 * pid_from_ns() on @sock. Runs in a forked child; never returns,
 * always terminates via _exit(): 0 on success, 1 on any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Attach to the target task's pid namespace. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child ACKs that it is running. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page of stack suffices for the small clone wrapper. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone(), not fork(): only cloned children enter the pidns. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2685
2686/*
2687 * Given host @uid, return the uid to which it maps in
2688 * @pid's user namespace, or -1 if none.
2689 */
2690bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2691{
2692 FILE *f;
2693 char line[400];
2694
2695 sprintf(line, "/proc/%d/uid_map", pid);
2696 if ((f = fopen(line, "r")) == NULL) {
2697 return false;
2698 }
2699
2700 *answer = convert_id_to_ns(f, uid);
2701 fclose(f);
2702
2703 if (*answer == -1)
2704 return false;
2705 return true;
2706}
2707
2708/*
2709 * get_pid_creds: get the real uid and gid of @pid from
2710 * /proc/$$/status
2711 * (XXX should we use euid here?)
2712 */
2713void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2714{
2715 char line[400];
2716 uid_t u;
2717 gid_t g;
2718 FILE *f;
2719
2720 *uid = -1;
2721 *gid = -1;
2722 sprintf(line, "/proc/%d/status", pid);
2723 if ((f = fopen(line, "r")) == NULL) {
b8defc3d 2724 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
237e200e
SH
2725 return;
2726 }
2727 while (fgets(line, 400, f)) {
2728 if (strncmp(line, "Uid:", 4) == 0) {
2729 if (sscanf(line+4, "%u", &u) != 1) {
b8defc3d 2730 lxcfs_error("bad uid line for pid %u\n", pid);
237e200e
SH
2731 fclose(f);
2732 return;
2733 }
2734 *uid = u;
2735 } else if (strncmp(line, "Gid:", 4) == 0) {
2736 if (sscanf(line+4, "%u", &g) != 1) {
b8defc3d 2737 lxcfs_error("bad gid line for pid %u\n", pid);
237e200e
SH
2738 fclose(f);
2739 return;
2740 }
2741 *gid = g;
2742 }
2743 }
2744 fclose(f);
2745}
2746
2747/*
2748 * May the requestor @r move victim @v to a new cgroup?
2749 * This is allowed if
2750 * . they are the same task
2751 * . they are ownedy by the same uid
2752 * . @r is root on the host, or
2753 * . @v's uid is mapped into @r's where @r is root.
2754 */
2755bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2756{
2757 uid_t v_uid, tmpuid;
2758 gid_t v_gid;
2759
2760 if (r == v)
2761 return true;
2762 if (r_uid == 0)
2763 return true;
2764 get_pid_creds(v, &v_uid, &v_gid);
2765 if (r_uid == v_uid)
2766 return true;
2767 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2768 && hostuid_to_ns(v_uid, r, &tmpuid))
2769 return true;
2770 return false;
2771}
2772
/*
 * do_write_pids - write the pids listed in @buf into @contrl:@cg's pids
 * file, translating each pid from writer @tpid's pid namespace via a
 * helper child first. Each move is permission-checked against @tuid
 * with may_move_pid(). Returns true when every pid was accepted.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		/* Hand the raw pid to the helper in the writer's pidns. */
		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		/* '0' means the helper sent back a translated pid. */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2851
2852int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2853 struct fuse_file_info *fi)
2854{
2855 struct fuse_context *fc = fuse_get_context();
2856 char *localbuf = NULL;
2857 struct cgfs_files *k = NULL;
2858 struct file_info *f = (struct file_info *)fi->fh;
2859 bool r;
2860
5fbea8a6
CB
2861 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2862 return -EIO;
2863
237e200e 2864 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2865 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
2866 return -EIO;
2867 }
2868
2869 if (offset)
2870 return 0;
2871
237e200e
SH
2872 localbuf = alloca(size+1);
2873 localbuf[size] = '\0';
2874 memcpy(localbuf, buf, size);
2875
2876 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2877 size = -EINVAL;
2878 goto out;
2879 }
2880
2881 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2882 size = -EACCES;
2883 goto out;
2884 }
2885
2886 if (strcmp(f->file, "tasks") == 0 ||
2887 strcmp(f->file, "/tasks") == 0 ||
2888 strcmp(f->file, "/cgroup.procs") == 0 ||
2889 strcmp(f->file, "cgroup.procs") == 0)
2890 // special case - we have to translate the pids
2891 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2892 else
2893 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2894
2895 if (!r)
2896 size = -EINVAL;
2897
2898out:
2899 free_key(k);
2900 return size;
2901}
2902
2903int cg_chown(const char *path, uid_t uid, gid_t gid)
2904{
2905 struct fuse_context *fc = fuse_get_context();
2906 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2907 struct cgfs_files *k = NULL;
2908 const char *cgroup;
2909 int ret;
2910
5fbea8a6 2911 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
2912 return -EIO;
2913
2914 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2915 return -EPERM;
237e200e
SH
2916
2917 controller = pick_controller_from_path(fc, path);
2918 if (!controller)
bc70ba9b
CB
2919 return errno == ENOENT ? -EPERM : -errno;
2920
237e200e
SH
2921 cgroup = find_cgroup_in_path(path);
2922 if (!cgroup)
2923 /* this is just /cgroup/controller */
bc70ba9b 2924 return -EPERM;
237e200e
SH
2925
2926 get_cgdir_and_path(cgroup, &cgdir, &last);
2927
2928 if (!last) {
2929 path1 = "/";
2930 path2 = cgdir;
2931 } else {
2932 path1 = cgdir;
2933 path2 = last;
2934 }
2935
2936 if (is_child_cgroup(controller, path1, path2)) {
2937 // get uid, gid, from '/tasks' file and make up a mode
2938 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2939 k = cgfs_get_key(controller, cgroup, "tasks");
2940
2941 } else
2942 k = cgfs_get_key(controller, path1, path2);
2943
2944 if (!k) {
2945 ret = -EINVAL;
2946 goto out;
2947 }
2948
2949 /*
2950 * This being a fuse request, the uid and gid must be valid
2951 * in the caller's namespace. So we can just check to make
2952 * sure that the caller is root in his uid, and privileged
2953 * over the file's current owner.
2954 */
2955 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2956 ret = -EACCES;
2957 goto out;
2958 }
2959
2960 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2961
2962out:
2963 free_key(k);
2964 free(cgdir);
2965
2966 return ret;
2967}
2968
2969int cg_chmod(const char *path, mode_t mode)
2970{
2971 struct fuse_context *fc = fuse_get_context();
2972 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2973 struct cgfs_files *k = NULL;
2974 const char *cgroup;
2975 int ret;
2976
5fbea8a6 2977 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
2978 return -EIO;
2979
2980 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2981 return -EPERM;
237e200e
SH
2982
2983 controller = pick_controller_from_path(fc, path);
2984 if (!controller)
bc70ba9b
CB
2985 return errno == ENOENT ? -EPERM : -errno;
2986
237e200e
SH
2987 cgroup = find_cgroup_in_path(path);
2988 if (!cgroup)
2989 /* this is just /cgroup/controller */
bc70ba9b 2990 return -EPERM;
237e200e
SH
2991
2992 get_cgdir_and_path(cgroup, &cgdir, &last);
2993
2994 if (!last) {
2995 path1 = "/";
2996 path2 = cgdir;
2997 } else {
2998 path1 = cgdir;
2999 path2 = last;
3000 }
3001
3002 if (is_child_cgroup(controller, path1, path2)) {
3003 // get uid, gid, from '/tasks' file and make up a mode
3004 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3005 k = cgfs_get_key(controller, cgroup, "tasks");
3006
3007 } else
3008 k = cgfs_get_key(controller, path1, path2);
3009
3010 if (!k) {
3011 ret = -EINVAL;
3012 goto out;
3013 }
3014
3015 /*
3016 * This being a fuse request, the uid and gid must be valid
3017 * in the caller's namespace. So we can just check to make
3018 * sure that the caller is root in his uid, and privileged
3019 * over the file's current owner.
3020 */
3021 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3022 ret = -EPERM;
3023 goto out;
3024 }
3025
3026 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3027 ret = -EINVAL;
3028 goto out;
3029 }
3030
3031 ret = 0;
3032out:
3033 free_key(k);
3034 free(cgdir);
3035 return ret;
3036}
3037
3038int cg_mkdir(const char *path, mode_t mode)
3039{
3040 struct fuse_context *fc = fuse_get_context();
3041 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3042 const char *cgroup;
3043 int ret;
3044
5fbea8a6 3045 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
3046 return -EIO;
3047
237e200e
SH
3048 controller = pick_controller_from_path(fc, path);
3049 if (!controller)
2f7036d0 3050 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3051
3052 cgroup = find_cgroup_in_path(path);
3053 if (!cgroup)
bc70ba9b 3054 return -errno;
237e200e
SH
3055
3056 get_cgdir_and_path(cgroup, &cgdir, &last);
3057 if (!last)
3058 path1 = "/";
3059 else
3060 path1 = cgdir;
3061
3062 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 3063 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
3064 initpid = fc->pid;
3065 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3066 if (!next)
3067 ret = -EINVAL;
3068 else if (last && strcmp(next, last) == 0)
3069 ret = -EEXIST;
3070 else
2f7036d0 3071 ret = -EPERM;
237e200e
SH
3072 goto out;
3073 }
3074
3075 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3076 ret = -EACCES;
3077 goto out;
3078 }
3079 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3080 ret = -EACCES;
3081 goto out;
3082 }
3083
3084 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3085
3086out:
3087 free(cgdir);
3088 free(next);
3089 return ret;
3090}
3091
3092int cg_rmdir(const char *path)
3093{
3094 struct fuse_context *fc = fuse_get_context();
3095 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3096 const char *cgroup;
3097 int ret;
3098
5fbea8a6 3099 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
237e200e
SH
3100 return -EIO;
3101
3102 controller = pick_controller_from_path(fc, path);
e254948f
CB
3103 if (!controller) /* Someone's trying to delete "/cgroup". */
3104 return -EPERM;
237e200e
SH
3105
3106 cgroup = find_cgroup_in_path(path);
e254948f
CB
3107 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3108 return -EPERM;
237e200e
SH
3109
3110 get_cgdir_and_path(cgroup, &cgdir, &last);
3111 if (!last) {
e254948f
CB
3112 /* Someone's trying to delete a cgroup on the same level as the
3113 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3114 * rmdir "/cgroup/blkio/init.slice".
3115 */
3116 ret = -EPERM;
237e200e
SH
3117 goto out;
3118 }
3119
3120 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 3121 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e
SH
3122 initpid = fc->pid;
3123 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3124 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3125 ret = -EBUSY;
3126 else
3127 ret = -ENOENT;
3128 goto out;
3129 }
3130
3131 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3132 ret = -EACCES;
3133 goto out;
3134 }
3135 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3136 ret = -EACCES;
3137 goto out;
3138 }
3139
3140 if (!cgfs_remove(controller, cgroup)) {
3141 ret = -EINVAL;
3142 goto out;
3143 }
3144
3145 ret = 0;
3146
3147out:
3148 free(cgdir);
3149 free(next);
3150 return ret;
3151}
3152
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3159
66c5e848
CB
/* Note that "memory.stat" in cgroup2 is hierarchical by default. */
/*
 * parse_memstat - extract the fields lxcfs needs from a memory.stat
 * blob. @version selects between legacy cgroup keys ("total_*") and
 * the unified-hierarchy key names. All results are converted from
 * bytes to KiB. Fields absent from @memstat are left untouched.
 */
static void parse_memstat(int version,
			  char *memstat,
			  unsigned long *cached,
			  unsigned long *active_anon,
			  unsigned long *inactive_anon,
			  unsigned long *active_file,
			  unsigned long *inactive_file,
			  unsigned long *unevictable,
			  unsigned long *shmem)
{
	bool unified = is_unified_controller(version);
	/* Key-name pairs (legacy vs unified) and their result slots. */
	struct {
		const char *legacy;
		const char *unified;
		unsigned long *valp;
	} fields[] = {
		{ "total_cache",         "cache",         cached        },
		{ "total_active_anon",   "active_anon",   active_anon   },
		{ "total_inactive_anon", "inactive_anon", inactive_anon },
		{ "total_active_file",   "active_file",   active_file   },
		{ "total_inactive_file", "inactive_file", inactive_file },
		{ "total_unevictable",   "unevictable",   unevictable   },
		{ "total_shmem",         "shmem",         shmem         },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			const char *key = unified ? fields[i].unified
						  : fields[i].legacy;

			if (!startswith(memstat, key))
				continue;

			/* Parse directly after the matched key. The old
			 * fixed offsets (11/17/19/...) equalled the legacy
			 * key lengths but, for the shorter unified names,
			 * pointed into the middle of the value.
			 */
			sscanf(memstat + strlen(key), "%lu", fields[i].valp);
			*fields[i].valp /= 1024; /* bytes -> KiB */
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3216
/*
 * get_blkio_io_value - look up the value for the line
 * "major:minor iotype" in the blkio stat blob @str and store it in @*v
 * (0 when the key is not present).
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *eol;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (; *str; str = eol + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3239
5fbea8a6 3240int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
237e200e 3241{
54a6d46a
CB
3242 __do_free char *line = NULL;
3243 __do_fclose FILE *f = NULL;
3244 size_t linelen = 0, total_len = 0;
237e200e
SH
3245 char *cache = d->buf;
3246 size_t cache_size = d->buflen;
54a6d46a
CB
3247
3248 f = fopen(path, "r");
237e200e
SH
3249 if (!f)
3250 return 0;
3251
3252 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3253 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3254 if (l < 0) {
3255 perror("Error writing to cache");
54a6d46a 3256 return 0;
237e200e
SH
3257 }
3258 if (l >= cache_size) {
b8defc3d 3259 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
54a6d46a 3260 return 0;
237e200e
SH
3261 }
3262 cache += l;
3263 cache_size -= l;
3264 total_len += l;
3265 }
3266
3267 d->size = total_len;
a262ddb7
CB
3268 if (total_len > size)
3269 total_len = size;
237e200e
SH
3270
3271 /* read from off 0 */
3272 memcpy(buf, d->buf, total_len);
54a6d46a
CB
3273
3274 if (d->size > total_len)
3275 d->cached = d->size - total_len;
3276 return total_len;
237e200e
SH
3277}
3278
3279/*
3280 * FUSE ops for /proc
3281 */
3282
66c5e848 3283static unsigned long get_memlimit(const char *cgroup, bool swap)
237e200e 3284{
66c5e848 3285 int ret;
cfb72d52 3286 __do_free char *memlimit_str = NULL;
237e200e
SH
3287 unsigned long memlimit = -1;
3288
66c5e848
CB
3289 if (swap)
3290 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3291 else
3292 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3293 if (ret > 0)
237e200e
SH
3294 memlimit = strtoul(memlimit_str, NULL, 10);
3295
237e200e
SH
3296 return memlimit;
3297}
3298
66c5e848 3299static unsigned long get_min_memlimit(const char *cgroup, bool swap)
237e200e 3300{
cfb72d52
CB
3301 __do_free char *copy = NULL;
3302 unsigned long memlimit = 0;
3303 unsigned long retlimit;
237e200e 3304
66c5e848
CB
3305 copy = strdup(cgroup);
3306 retlimit = get_memlimit(copy, swap);
237e200e
SH
3307
3308 while (strcmp(copy, "/") != 0) {
66c5e848
CB
3309 char *it = copy;
3310
3311 it = dirname(it);
3312 memlimit = get_memlimit(it, swap);
237e200e
SH
3313 if (memlimit != -1 && memlimit < retlimit)
3314 retlimit = memlimit;
3315 };
3316
3317 return retlimit;
3318}
3319
3320static int proc_meminfo_read(char *buf, size_t size, off_t offset,
cfb72d52 3321 struct fuse_file_info *fi)
237e200e 3322{
66c5e848 3323 __do_free char *cgroup = NULL, *line = NULL,
cfb72d52
CB
3324 *memusage_str = NULL, *memstat_str = NULL,
3325 *memswlimit_str = NULL, *memswusage_str = NULL;
3326 __do_fclose FILE *f = NULL;
237e200e 3327 struct fuse_context *fc = fuse_get_context();
7e60aa1b 3328 struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
237e200e 3329 struct file_info *d = (struct file_info *)fi->fh;
cfb72d52
CB
3330 unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3331 memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3332 inactive_anon = 0, active_file = 0, inactive_file = 0,
3333 unevictable = 0, shmem = 0, hostswtotal = 0;
66c5e848 3334 size_t linelen = 0, total_len = 0;
237e200e
SH
3335 char *cache = d->buf;
3336 size_t cache_size = d->buflen;
66c5e848 3337 int ret;
237e200e 3338
cfb72d52
CB
3339 if (offset) {
3340 int left;
3341
237e200e
SH
3342 if (offset > d->size)
3343 return -EINVAL;
cfb72d52 3344
237e200e
SH
3345 if (!d->cached)
3346 return 0;
cfb72d52
CB
3347
3348 left = d->size - offset;
3349 total_len = left > size ? size : left;
237e200e 3350 memcpy(buf, cache + offset, total_len);
cfb72d52 3351
237e200e
SH
3352 return total_len;
3353 }
3354
3355 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 3356 if (initpid <= 1 || is_shared_pidns(initpid))
237e200e 3357 initpid = fc->pid;
cfb72d52 3358
66c5e848
CB
3359 cgroup = get_pid_cgroup(initpid, "memory");
3360 if (!cgroup)
5fbea8a6 3361 return read_file_fuse("/proc/meminfo", buf, size, d);
237e200e 3362
66c5e848 3363 prune_init_slice(cgroup);
cfb72d52 3364
66c5e848 3365 memlimit = get_min_memlimit(cgroup, false);
cfb72d52 3366
66c5e848
CB
3367 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3368 if (ret < 0)
cfb72d52
CB
3369 return 0;
3370
66c5e848
CB
3371 ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3372 if (ret < 0)
cfb72d52 3373 return 0;
66c5e848
CB
3374 parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3375 &active_file, &inactive_file, &unevictable, &shmem);
cfb72d52
CB
3376
3377 /*
3378 * Following values are allowed to fail, because swapaccount might be
3379 * turned off for current kernel.
3380 */
66c5e848
CB
3381 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3382 if (ret >= 0)
3383 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3384 if (ret >= 0) {
3385 memswlimit = get_min_memlimit(cgroup, true);
cfb72d52 3386 memswusage = strtoul(memswusage_str, NULL, 10);
237e200e
SH
3387 memswlimit = memswlimit / 1024;
3388 memswusage = memswusage / 1024;
3389 }
3390
3391 memusage = strtoul(memusage_str, NULL, 10);
3392 memlimit /= 1024;
3393 memusage /= 1024;
3394
237e200e
SH
3395 f = fopen("/proc/meminfo", "r");
3396 if (!f)
cfb72d52 3397 return 0;
237e200e
SH
3398
3399 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3400 ssize_t l;
237e200e
SH
3401 char *printme, lbuf[100];
3402
3403 memset(lbuf, 0, 100);
3404 if (startswith(line, "MemTotal:")) {
594a10e6 3405 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3406 if (hosttotal < memlimit)
3407 memlimit = hosttotal;
3408 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3409 printme = lbuf;
3410 } else if (startswith(line, "MemFree:")) {
3411 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3412 printme = lbuf;
3413 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3414 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e 3415 printme = lbuf;
cfb72d52
CB
3416 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3417 opts && opts->swap_off == false) {
594a10e6 3418 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3419 if (hostswtotal < memswlimit)
3420 memswlimit = hostswtotal;
3421 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e 3422 printme = lbuf;
b3aa7f1a 3423 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
7e60aa1b 3424 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
3425 printme = lbuf;
cfb72d52
CB
3426 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3427 memswusage > 0 && opts && opts->swap_off == false) {
4127e51b 3428 unsigned long swaptotal = memswlimit,
cfb72d52
CB
3429 swapusage = memusage > memswusage
3430 ? 0
3431 : memswusage - memusage,
3432 swapfree = swapusage < swaptotal
3433 ? swaptotal - swapusage
3434 : 0;
b4665ce0 3435 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3436 printme = lbuf;
b3aa7f1a 3437 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
7e60aa1b 3438 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
3439 printme = lbuf;
da35d72a
SH
3440 } else if (startswith(line, "Slab:")) {
3441 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3442 printme = lbuf;
237e200e
SH
3443 } else if (startswith(line, "Buffers:")) {
3444 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3445 printme = lbuf;
3446 } else if (startswith(line, "Cached:")) {
3447 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3448 printme = lbuf;
3449 } else if (startswith(line, "SwapCached:")) {
3450 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3451 printme = lbuf;
2f306ad3 3452 } else if (startswith(line, "Active:")) {
c6095b08
SH
3453 snprintf(lbuf, 100, "Active: %8lu kB\n",
3454 active_anon + active_file);
3455 printme = lbuf;
2f306ad3 3456 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3457 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3458 inactive_anon + inactive_file);
3459 printme = lbuf;
3460 } else if (startswith(line, "Active(anon)")) {
3461 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3462 printme = lbuf;
3463 } else if (startswith(line, "Inactive(anon)")) {
3464 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3465 printme = lbuf;
3466 } else if (startswith(line, "Active(file)")) {
3467 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3468 printme = lbuf;
3469 } else if (startswith(line, "Inactive(file)")) {
3470 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3471 printme = lbuf;
3472 } else if (startswith(line, "Unevictable")) {
3473 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3474 printme = lbuf;
3475 } else if (startswith(line, "SReclaimable")) {
3476 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3477 printme = lbuf;
3478 } else if (startswith(line, "SUnreclaim")) {
3479 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3480 printme = lbuf;
559eaa8f
JS
3481 } else if (startswith(line, "Shmem:")) {
3482 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3483 printme = lbuf;
28cdea9b
JS
3484 } else if (startswith(line, "ShmemHugePages")) {
3485 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3486 printme = lbuf;
3487 } else if (startswith(line, "ShmemPmdMapped")) {
3488 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3489 printme = lbuf;
237e200e
SH
3490 } else
3491 printme = line;
3492
3493 l = snprintf(cache, cache_size, "%s", printme);
3494 if (l < 0) {
3495 perror("Error writing to cache");
cfb72d52 3496 return 0;
237e200e
SH
3497
3498 }
3499 if (l >= cache_size) {
b8defc3d 3500 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
cfb72d52 3501 return 0;
237e200e
SH
3502 }
3503
3504 cache += l;
3505 cache_size -= l;
3506 total_len += l;
3507 }
3508
3509 d->cached = 1;
3510 d->size = total_len;
3511 if (total_len > size ) total_len = size;
3512 memcpy(buf, d->buf, total_len);
3513
66c5e848 3514 return total_len;
237e200e
SH
3515}
3516
3517/*
3518 * Read the cpuset.cpus for cg
3519 * Return the answer in a newly allocated string which must be freed
3520 */
71f17cd2 3521char *get_cpuset(const char *cg)
237e200e 3522{
2c3bcd9e
CB
3523 char *value = NULL;
3524 int ret;
237e200e 3525
2c3bcd9e
CB
3526 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3527 if (ret < 0)
237e200e 3528 return NULL;
2c3bcd9e
CB
3529
3530 return value;
237e200e
SH
3531}
3532
3533bool cpu_in_cpuset(int cpu, const char *cpuset);
3534
/* Does this /proc/cpuinfo line name a processor that belongs to @cpuset?
 * Lines that are not "processor : N" lines yield false. */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int nr;

	return sscanf(line, "processor : %d", &nr) == 1 &&
	       cpu_in_cpuset(nr, cpuset);
}
3543
c59d6a55
JS
3544/*
3545 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3546 * depending on `param`. Parameter value is returned through `value`.
3547 */
3548static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3549{
54a6d46a
CB
3550 __do_free char *str = NULL;
3551 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
c59d6a55 3552
54a6d46a 3553 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
c59d6a55 3554
1ca6a467 3555 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
54a6d46a 3556 return false;
c59d6a55
JS
3557
3558 if (sscanf(str, "%ld", value) != 1)
54a6d46a 3559 return false;
c59d6a55 3560
54a6d46a 3561 return true;
c59d6a55
JS
3562}
3563
3564/*
3565 * Return the maximum number of visible CPUs based on CPU quotas.
3566 * If there is no quota set, zero is returned.
3567 */
3568int max_cpu_count(const char *cg)
3569{
3570 int rv, nprocs;
3571 int64_t cfs_quota, cfs_period;
8c23db36
HY
3572 int nr_cpus_in_cpuset = 0;
3573 char *cpuset = NULL;
c59d6a55
JS
3574
3575 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3576 return 0;
3577
3578 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3579 return 0;
3580
8c23db36
HY
3581 cpuset = get_cpuset(cg);
3582 if (cpuset)
3583 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3584
3585 if (cfs_quota <= 0 || cfs_period <= 0){
3586 if (nr_cpus_in_cpuset > 0)
3587 return nr_cpus_in_cpuset;
3588
c59d6a55 3589 return 0;
8c23db36 3590 }
c59d6a55
JS
3591
3592 rv = cfs_quota / cfs_period;
3593
3594 /* In case quota/period does not yield a whole number, add one CPU for
3595 * the remainder.
3596 */
3597 if ((cfs_quota % cfs_period) > 0)
3598 rv += 1;
3599
3600 nprocs = get_nprocs();
3601
3602 if (rv > nprocs)
3603 rv = nprocs;
3604
8c23db36
HY
3605 /* use min value in cpu quota and cpuset */
3606 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3607 rv = nr_cpus_in_cpuset;
3608
c59d6a55
JS
3609 return rv;
3610}
3611
db1b32f6
SX
3612/*
3613 * Return the exact number of visible CPUs based on CPU quotas.
3614 * If there is no quota set, zero is returned.
3615 */
/*
 * Return the exact (fractional) number of visible CPUs based on CPU
 * quotas, capped by the host CPU count. Zero means "no quota set".
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	double count;
	int nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3640
237e200e
SH
3641/*
3642 * check whether this is a '^processor" line in /proc/cpuinfo
3643 */
/* Is this a "processor : N" line from /proc/cpuinfo? */
static bool is_processor_line(const char *line)
{
	int unused;

	return sscanf(line, "processor : %d", &unused) == 1;
}
3652
/*
 * FUSE read handler for the virtualized /proc/cpuinfo: emits only the
 * processor entries that fall inside the reader's cpuset (and, when the
 * cpuview machinery is usable, at most max_cpu_count(cg) entries),
 * renumbering them from 0. The rendered text is cached in the per-open
 * file_info buffer so reads at non-zero offsets are served from cache.
 * Returns the number of bytes copied into @buf, 0 on internal failure.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	/* Follow-up read: serve the remainder of the cached rendering. */
	if (offset){
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	/* Resolve the reader's init pid to locate its cpuset cgroup; with
	 * a shared pid namespace fall back to the caller's own pid. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x formats /proc/cpuinfo differently ("processor N:"
		 * lines after a vendor banner); detect that on line one. */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* The host's s390x processor-count line is recomputed below. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once we have emitted max_cpus entries. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					return 0;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					return 0;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			/* Re-emit the line with the renumbered CPU id. */
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-processor detail lines are copied through while inside
		 * a visible processor's stanza. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: rebuild the output with the banner and the corrected
	 * processor count in front of what we collected above. */
	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	return total_len;
}
3818
/*
 * Return the start time of @pid's reaper/init process: field 22 of
 * /proc/<pid>/stat, expressed in clock ticks since system boot.
 *
 * On failure returns 0 with errno set to EINVAL; on success errno is
 * cleared, so callers can distinguish a genuine 0 start time.
 */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument suppression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 */
	ret = fscanf(f, "%*d " /* (1) pid %d */
		     "%*s " /* (2) comm %s */
		     "%*c " /* (3) state %c */
		     "%*d " /* (4) ppid %d */
		     "%*d " /* (5) pgrp %d */
		     "%*d " /* (6) session %d */
		     "%*d " /* (7) tty_nr %d */
		     "%*d " /* (8) tpgid %d */
		     "%*u " /* (9) flags %u */
		     "%*u " /* (10) minflt %lu */
		     "%*u " /* (11) cminflt %lu */
		     "%*u " /* (12) majflt %lu */
		     "%*u " /* (13) cmajflt %lu */
		     "%*u " /* (14) utime %lu */
		     "%*u " /* (15) stime %lu */
		     "%*d " /* (16) cutime %ld */
		     "%*d " /* (17) cstime %ld */
		     "%*d " /* (18) priority %ld */
		     "%*d " /* (19) nice %ld */
		     "%*d " /* (20) num_threads %ld */
		     "%*d " /* (21) itrealvalue %ld */
		     "%" PRIu64, /* (22) starttime %llu */
		     &starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	errno = 0;
	return starttime;
}
3897
1c4b4e38 3898static double get_reaper_start_time_in_sec(pid_t pid)
0ecddf02 3899{
1c4b4e38
CB
3900 uint64_t clockticks, ticks_per_sec;
3901 int64_t ret;
3902 double res = 0;
0ecddf02
CB
3903
3904 clockticks = get_reaper_start_time(pid);
3905 if (clockticks == 0 && errno == EINVAL) {
3906 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3907 return 0;
3908 }
3909
1c4b4e38
CB
3910 ret = sysconf(_SC_CLK_TCK);
3911 if (ret < 0 && errno == EINVAL) {
0ecddf02
CB
3912 lxcfs_debug(
3913 "%s\n",
3914 "failed to determine number of clock ticks in a second");
3915 return 0;
3916 }
3917
1c4b4e38
CB
3918 ticks_per_sec = (uint64_t)ret;
3919 res = (double)clockticks / ticks_per_sec;
3920 return res;
0ecddf02
CB
3921}
3922
1c4b4e38 3923static double get_reaper_age(pid_t pid)
0ecddf02 3924{
1c4b4e38
CB
3925 uint64_t uptime_ms;
3926 double procstart, procage;
0ecddf02
CB
3927
3928 /* We need to substract the time the process has started since system
3929 * boot minus the time when the system has started to get the actual
3930 * reaper age.
3931 */
3932 procstart = get_reaper_start_time_in_sec(pid);
3933 procage = procstart;
3934 if (procstart > 0) {
3935 int ret;
3936 struct timespec spec;
3937
3938 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3939 if (ret < 0)
3940 return 0;
1c4b4e38 3941
0ecddf02
CB
3942 /* We could make this more precise here by using the tv_nsec
3943 * field in the timespec struct and convert it to milliseconds
3944 * and then create a double for the seconds and milliseconds but
3945 * that seems more work than it is worth.
3946 */
1c4b4e38
CB
3947 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3948 procage = (uptime_ms - (procstart * 1000)) / 1000;
0ecddf02
CB
3949 }
3950
3951 return procage;
3952}
3953
8be92dd1
JS
3954/*
3955 * Returns 0 on success.
3956 * It is the caller's responsibility to free `return_usage`, unless this
3957 * function returns an error.
3958 */
79612c8b 3959static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
8be92dd1 3960{
54a6d46a
CB
3961 __do_free char *usage_str = NULL;
3962 __do_free struct cpuacct_usage *cpu_usage = NULL;
77005a6c 3963 int cpucount = get_nprocs_conf();
54a6d46a
CB
3964 int read_pos = 0, read_cnt=0;
3965 int i, j, ret;
8be92dd1
JS
3966 int cg_cpu;
3967 uint64_t cg_user, cg_system;
3968 int64_t ticks_per_sec;
8be92dd1
JS
3969
3970 ticks_per_sec = sysconf(_SC_CLK_TCK);
3971
3972 if (ticks_per_sec < 0 && errno == EINVAL) {
db1b32f6 3973 lxcfs_v(
8be92dd1
JS
3974 "%s\n",
3975 "read_cpuacct_usage_all failed to determine number of clock ticks "
3976 "in a second");
3977 return -1;
3978 }
3979
3980 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3981 if (!cpu_usage)
3982 return -ENOMEM;
3983
db1b32f6 3984 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1ca6a467 3985 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
54a6d46a
CB
3986 char *data = NULL;
3987 int i = 0, read_pos = 0, read_cnt=0;
3988 size_t sz = 0, asz = 0;
3989
3990 /* read cpuacct.usage_percpu instead. */
db1b32f6 3991 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
54a6d46a
CB
3992 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
3993 return -1;
db1b32f6
SX
3994 lxcfs_v("usage_str: %s\n", usage_str);
3995
54a6d46a 3996 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
db1b32f6 3997 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
1c4b4e38 3998
db1b32f6
SX
3999 must_strcat(&data, &sz, &asz, "cpu user system\n");
4000
db1b32f6
SX
4001 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4002 lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4003 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4004 i++;
4005 read_pos += read_cnt;
4006 }
4007
db1b32f6
SX
4008 usage_str = data;
4009
4010 lxcfs_v("usage_str: %s\n", usage_str);
8be92dd1
JS
4011 }
4012
4013 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4014 lxcfs_error("read_cpuacct_usage_all reading first line from "
4015 "%s/cpuacct.usage_all failed.\n", cg);
54a6d46a 4016 return -1;
8be92dd1
JS
4017 }
4018
4019 read_pos += read_cnt;
4020
4021 for (i = 0, j = 0; i < cpucount; i++) {
4022 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4023 &cg_system, &read_cnt);
4024
4025 if (ret == EOF)
4026 break;
4027
4028 if (ret != 3) {
4029 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4030 "failed.\n", cg);
54a6d46a 4031 return -1;
8be92dd1
JS
4032 }
4033
4034 read_pos += read_cnt;
4035
8be92dd1
JS
4036 /* Convert the time from nanoseconds to USER_HZ */
4037 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4038 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4039 j++;
4040 }
4041
54a6d46a 4042 *return_usage = move_ptr(cpu_usage);
79612c8b 4043 *size = cpucount;
54a6d46a 4044 return 0;
8be92dd1
JS
4045}
4046
056adcef
JS
4047static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4048{
4049 int i;
4050 unsigned long sum = 0;
4051
4052 for (i = 0; i < cpu_count; i++) {
77005a6c
JS
4053 if (!newer[i].online)
4054 continue;
4055
056adcef
JS
4056 /* When cpuset is changed on the fly, the CPUs might get reordered.
4057 * We could either reset all counters, or check that the substractions
4058 * below will return expected results.
4059 */
4060 if (newer[i].user > older[i].user)
4061 diff[i].user = newer[i].user - older[i].user;
4062 else
4063 diff[i].user = 0;
4064
4065 if (newer[i].system > older[i].system)
4066 diff[i].system = newer[i].system - older[i].system;
4067 else
4068 diff[i].system = 0;
4069
4070 if (newer[i].idle > older[i].idle)
4071 diff[i].idle = newer[i].idle - older[i].idle;
4072 else
4073 diff[i].idle = 0;
4074
4075 sum += diff[i].user;
4076 sum += diff[i].system;
4077 sum += diff[i].idle;
4078 }
4079
4080 return sum;
4081}
4082
4083static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4084{
4085 unsigned long free_space, to_add;
4086
4087 free_space = threshold - usage->user - usage->system;
4088
4089 if (free_space > usage->idle)
4090 free_space = usage->idle;
4091
4092 to_add = free_space > *surplus ? *surplus : free_space;
4093
4094 *counter += to_add;
4095 usage->idle -= to_add;
4096 *surplus -= to_add;
4097}
4098
951acc94
JS
4099static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4100{
4101 struct cg_proc_stat *first = NULL, *prev, *tmp;
4102
4103 for (prev = NULL; node; ) {
4104 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4105 tmp = node;
4106 lxcfs_debug("Removing stat node for %s\n", node->cg);
4107
4108 if (prev)
4109 prev->next = node->next;
4110 else
4111 first = node->next;
4112
4113 node = node->next;
4114 free_proc_stat_node(tmp);
4115 } else {
4116 if (!first)
4117 first = node;
4118 prev = node;
4119 node = node->next;
4120 }
4121 }
4122
4123 return first;
4124}
4125
4126#define PROC_STAT_PRUNE_INTERVAL 10
4127static void prune_proc_stat_history(void)
4128{
4129 int i;
4130 time_t now = time(NULL);
4131
4132 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
2f49b662
JS
4133 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4134
4135 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4136 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94 4137 return;
2f49b662 4138 }
951acc94 4139
2f49b662
JS
4140 if (proc_stat_history[i]->next) {
4141 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4142 proc_stat_history[i]->lastcheck = now;
4143 }
951acc94 4144
2f49b662 4145 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
951acc94
JS
4146 }
4147}
4148
2f49b662 4149static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
056adcef 4150{
056adcef
JS
4151 struct cg_proc_stat *node;
4152
2f49b662
JS
4153 pthread_rwlock_rdlock(&head->lock);
4154
4155 if (!head->next) {
4156 pthread_rwlock_unlock(&head->lock);
056adcef 4157 return NULL;
2f49b662 4158 }
056adcef
JS
4159
4160 node = head->next;
4161
4162 do {
4163 if (strcmp(cg, node->cg) == 0)
951acc94 4164 goto out;
056adcef
JS
4165 } while ((node = node->next));
4166
951acc94
JS
4167 node = NULL;
4168
4169out:
2f49b662 4170 pthread_rwlock_unlock(&head->lock);
951acc94
JS
4171 prune_proc_stat_history();
4172 return node;
056adcef
JS
4173}
4174
4175static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4176{
4177 struct cg_proc_stat *node;
4178 int i;
4179
4180 node = malloc(sizeof(struct cg_proc_stat));
4181 if (!node)
4182 goto err;
4183
4184 node->cg = NULL;
4185 node->usage = NULL;
4186 node->view = NULL;
4187
4188 node->cg = malloc(strlen(cg) + 1);
4189 if (!node->cg)
4190 goto err;
4191
4192 strcpy(node->cg, cg);
4193
4194 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4195 if (!node->usage)
4196 goto err;
4197
4198 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4199
4200 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4201 if (!node->view)
4202 goto err;
4203
4204 node->cpu_count = cpu_count;
4205 node->next = NULL;
4206
2f49b662
JS
4207 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4208 lxcfs_error("%s\n", "Failed to initialize node lock");
4209 goto err;
4210 }
4211
056adcef
JS
4212 for (i = 0; i < cpu_count; i++) {
4213 node->view[i].user = 0;
4214 node->view[i].system = 0;
4215 node->view[i].idle = 0;
4216 }
4217
4218 return node;
4219
4220err:
4221 if (node && node->cg)
4222 free(node->cg);
4223 if (node && node->usage)
4224 free(node->usage);
4225 if (node && node->view)
4226 free(node->view);
4227 if (node)
4228 free(node);
4229
4230 return NULL;
4231}
4232
2f49b662 4233static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
056adcef
JS
4234{
4235 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4236 struct cg_proc_stat_head *head = proc_stat_history[hash];
2f49b662
JS
4237 struct cg_proc_stat *node, *rv = new_node;
4238
4239 pthread_rwlock_wrlock(&head->lock);
056adcef
JS
4240
4241 if (!head->next) {
4242 head->next = new_node;
2f49b662 4243 goto out;
056adcef
JS
4244 }
4245
2f49b662
JS
4246 node = head->next;
4247
056adcef 4248 for (;;) {
2f49b662
JS
4249 if (strcmp(node->cg, new_node->cg) == 0) {
4250 /* The node is already present, return it */
4251 free_proc_stat_node(new_node);
4252 rv = node;
4253 goto out;
4254 }
056adcef
JS
4255
4256 if (node->next) {
4257 node = node->next;
4258 continue;
4259 }
4260
4261 node->next = new_node;
2f49b662
JS
4262 goto out;
4263 }
4264
4265out:
4266 pthread_rwlock_unlock(&head->lock);
4267 return rv;
4268}
4269
895f28e5
JS
4270static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4271{
54a6d46a 4272 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
895f28e5
JS
4273
4274 /* Allocate new memory */
4275 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4276 if (!new_usage)
4277 return false;
4278
4279 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
54a6d46a 4280 if (!new_view)
895f28e5 4281 return false;
895f28e5
JS
4282
4283 /* Copy existing data & initialize new elements */
54a6d46a 4284 for (int i = 0; i < cpu_count; i++) {
895f28e5
JS
4285 if (i < node->cpu_count) {
4286 new_usage[i].user = node->usage[i].user;
4287 new_usage[i].system = node->usage[i].system;
4288 new_usage[i].idle = node->usage[i].idle;
4289
4290 new_view[i].user = node->view[i].user;
4291 new_view[i].system = node->view[i].system;
4292 new_view[i].idle = node->view[i].idle;
4293 } else {
4294 new_usage[i].user = 0;
4295 new_usage[i].system = 0;
4296 new_usage[i].idle = 0;
4297
4298 new_view[i].user = 0;
4299 new_view[i].system = 0;
4300 new_view[i].idle = 0;
4301 }
4302 }
4303
4304 free(node->usage);
54a6d46a 4305 node->usage = move_ptr(new_usage);
895f28e5 4306
54a6d46a
CB
4307 free(node->view);
4308 node->view = move_ptr(new_view);
895f28e5
JS
4309 node->cpu_count = cpu_count;
4310
4311 return true;
4312}
4313
2f49b662
JS
/*
 * Look up (or create and register) the cached CPU stat node for cgroup
 * @cg, expanding its per-CPU arrays when the host gained CPUs.
 *
 * On success the node is returned with node->lock HELD; the caller must
 * pthread_mutex_unlock() it. Returns NULL (unlocked) on allocation or
 * expansion failure.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node() returns an existing node (freeing
		 * ours) if another thread registered one concurrently. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4349
4350static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4351{
4352 int i;
4353
4354 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4355 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4356
4357 for (i = 0; i < cpu_count; i++) {
4358 node->view[i].user = 0;
4359 node->view[i].system = 0;
4360 node->view[i].idle = 0;
4361 }
4362
4363 node->cpu_count = cpu_count;
4364}
4365
54a6d46a
CB
4366static int cpuview_proc_stat(const char *cg, const char *cpuset,
4367 struct cpuacct_usage *cg_cpu_usage,
4368 int cg_cpu_usage_size, FILE *f, char *buf,
4369 size_t buf_size)
056adcef 4370{
54a6d46a
CB
4371 __do_free char *line = NULL;
4372 __do_free struct cpuacct_usage *diff = NULL;
4373 size_t linelen = 0, total_len = 0, l;
056adcef 4374 int curcpu = -1; /* cpu numbering starts at 0 */
77005a6c 4375 int physcpu, i;
056adcef 4376 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
54a6d46a
CB
4377 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4378 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
056adcef
JS
4379 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4380 unsigned long user_surplus = 0, system_surplus = 0;
4381 unsigned long total_sum, threshold;
4382 struct cg_proc_stat *stat_node;
77005a6c 4383 int nprocs = get_nprocs_conf();
056adcef 4384
79612c8b
JS
4385 if (cg_cpu_usage_size < nprocs)
4386 nprocs = cg_cpu_usage_size;
4387
056adcef
JS
4388 /* Read all CPU stats and stop when we've encountered other lines */
4389 while (getline(&line, &linelen, f) != -1) {
77005a6c 4390 int ret;
056adcef
JS
4391 char cpu_char[10]; /* That's a lot of cores */
4392 uint64_t all_used, cg_used;
4393
4394 if (strlen(line) == 0)
4395 continue;
54a6d46a
CB
4396
4397 /* not a ^cpuN line containing a number N */
4398 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
056adcef 4399 break;
056adcef 4400
77005a6c 4401 if (sscanf(cpu_char, "%d", &physcpu) != 1)
056adcef 4402 continue;
77005a6c 4403
79612c8b
JS
4404 if (physcpu >= cg_cpu_usage_size)
4405 continue;
4406
056adcef
JS
4407 curcpu ++;
4408 cpu_cnt ++;
4409
77005a6c 4410 if (!cpu_in_cpuset(physcpu, cpuset)) {
54a6d46a 4411 for (i = curcpu; i <= physcpu; i++)
77005a6c 4412 cg_cpu_usage[i].online = false;
77005a6c
JS
4413 continue;
4414 }
4415
4416 if (curcpu < physcpu) {
4417 /* Some CPUs may be disabled */
4418 for (i = curcpu; i < physcpu; i++)
4419 cg_cpu_usage[i].online = false;
4420
4421 curcpu = physcpu;
4422 }
4423
4424 cg_cpu_usage[curcpu].online = true;
4425
056adcef
JS
4426 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4427 &user,
4428 &nice,
4429 &system,
4430 &idle,
4431 &iowait,
4432 &irq,
4433 &softirq,
4434 &steal,
4435 &guest,
4436 &guest_nice);
4437
4438 if (ret != 10)
4439 continue;
4440
4441 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4442 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4443
4444 if (all_used >= cg_used) {
4445 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4446
4447 } else {
4448 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4449 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4450 curcpu, cg, all_used, cg_used);
4451 cg_cpu_usage[curcpu].idle = idle;
4452 }
4453 }
4454
4455 /* Cannot use more CPUs than is available due to cpuset */
4456 if (max_cpus > cpu_cnt)
4457 max_cpus = cpu_cnt;
4458
2f49b662 4459 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
056adcef
JS
4460
4461 if (!stat_node) {
2f49b662 4462 lxcfs_error("unable to find/create stat node for %s\n", cg);
54a6d46a 4463 return 0;
056adcef
JS
4464 }
4465
4466 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4467 if (!diff) {
54a6d46a 4468 return 0;
056adcef
JS
4469 }
4470
4471 /*
4472 * If the new values are LOWER than values stored in memory, it means
4473 * the cgroup has been reset/recreated and we should reset too.
4474 */
77005a6c
JS
4475 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4476 if (!cg_cpu_usage[curcpu].online)
4477 continue;
4478
4479 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4480 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4481
4482 break;
4483 }
056adcef 4484
77005a6c
JS
4485 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4486
4487 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4488 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4489
4490 if (!stat_node->usage[curcpu].online)
4491 continue;
4492
4493 i++;
056adcef 4494
056adcef
JS
4495 stat_node->usage[curcpu].user += diff[curcpu].user;
4496 stat_node->usage[curcpu].system += diff[curcpu].system;
4497 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4498
77005a6c 4499 if (max_cpus > 0 && i >= max_cpus) {
056adcef
JS
4500 user_surplus += diff[curcpu].user;
4501 system_surplus += diff[curcpu].system;
4502 }
4503 }
4504
4505 /* Calculate usage counters of visible CPUs */
4506 if (max_cpus > 0) {
54a6d46a
CB
4507 unsigned long diff_user = 0;
4508 unsigned long diff_system = 0;
4509 unsigned long diff_idle = 0;
4510 unsigned long max_diff_idle = 0;
4511 unsigned long max_diff_idle_index = 0;
4512 double exact_cpus;
4513
056adcef
JS
4514 /* threshold = maximum usage per cpu, including idle */
4515 threshold = total_sum / cpu_cnt * max_cpus;
4516
77005a6c 4517 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
77005a6c
JS
4518 if (!stat_node->usage[curcpu].online)
4519 continue;
4520
4521 i++;
4522
db1b32f6
SX
4523 if (i == max_cpus)
4524 break;
4525
056adcef
JS
4526 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4527 continue;
4528
4529 /* Add user */
54a6d46a
CB
4530 add_cpu_usage(&user_surplus, &diff[curcpu],
4531 &diff[curcpu].user, threshold);
056adcef
JS
4532
4533 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4534 continue;
4535
4536 /* If there is still room, add system */
54a6d46a
CB
4537 add_cpu_usage(&system_surplus, &diff[curcpu],
4538 &diff[curcpu].system, threshold);
056adcef
JS
4539 }
4540
4541 if (user_surplus > 0)
4542 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4543 if (system_surplus > 0)
4544 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4545
77005a6c 4546 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
77005a6c
JS
4547 if (!stat_node->usage[curcpu].online)
4548 continue;
4549
4550 i++;
4551
db1b32f6
SX
4552 if (i == max_cpus)
4553 break;
4554
056adcef
JS
4555 stat_node->view[curcpu].user += diff[curcpu].user;
4556 stat_node->view[curcpu].system += diff[curcpu].system;
4557 stat_node->view[curcpu].idle += diff[curcpu].idle;
4558
4559 user_sum += stat_node->view[curcpu].user;
4560 system_sum += stat_node->view[curcpu].system;
4561 idle_sum += stat_node->view[curcpu].idle;
056adcef 4562
db1b32f6
SX
4563 diff_user += diff[curcpu].user;
4564 diff_system += diff[curcpu].system;
4565 diff_idle += diff[curcpu].idle;
4566 if (diff[curcpu].idle > max_diff_idle) {
4567 max_diff_idle = diff[curcpu].idle;
4568 max_diff_idle_index = curcpu;
4569 }
4570
4571 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4572 }
4573 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4574
54a6d46a
CB
4575 /* revise cpu usage view to support partial cpu case. */
4576 exact_cpus = exact_cpu_count(cg);
db1b32f6 4577 if (exact_cpus < (double)max_cpus){
db1b32f6 4578 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
54a6d46a
CB
4579
4580 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
db1b32f6
SX
4581 lxcfs_v("delta: %lu\n", delta);
4582 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4583 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4584 lxcfs_v("idle_sum after: %lu\n", idle_sum);
1c4b4e38 4585
db1b32f6
SX
4586 curcpu = max_diff_idle_index;
4587 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4588 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4589 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4590 }
056adcef 4591 } else {
77005a6c
JS
4592 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4593 if (!stat_node->usage[curcpu].online)
4594 continue;
4595
056adcef
JS
4596 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4597 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4598 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4599
4600 user_sum += stat_node->view[curcpu].user;
4601 system_sum += stat_node->view[curcpu].system;
4602 idle_sum += stat_node->view[curcpu].idle;
4603 }
4604 }
4605
4606 /* Render the file */
4607 /* cpu-all */
4608 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4609 user_sum,
4610 system_sum,
4611 idle_sum);
db1b32f6 4612 lxcfs_v("cpu-all: %s\n", buf);
056adcef
JS
4613
4614 if (l < 0) {
4615 perror("Error writing to cache");
54a6d46a 4616 return 0;
056adcef
JS
4617 }
4618 if (l >= buf_size) {
4619 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
54a6d46a 4620 return 0;
056adcef
JS
4621 }
4622
4623 buf += l;
4624 buf_size -= l;
4625 total_len += l;
4626
4627 /* Render visible CPUs */
77005a6c
JS
4628 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4629 if (!stat_node->usage[curcpu].online)
4630 continue;
4631
4632 i++;
4633
4634 if (max_cpus > 0 && i == max_cpus)
056adcef
JS
4635 break;
4636
4637 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
77005a6c 4638 i,
056adcef
JS
4639 stat_node->view[curcpu].user,
4640 stat_node->view[curcpu].system,
4641 stat_node->view[curcpu].idle);
db1b32f6 4642 lxcfs_v("cpu: %s\n", buf);
056adcef
JS
4643
4644 if (l < 0) {
4645 perror("Error writing to cache");
54a6d46a 4646 return 0;
056adcef
JS
4647
4648 }
4649 if (l >= buf_size) {
4650 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
54a6d46a 4651 return 0;
056adcef
JS
4652 }
4653
4654 buf += l;
4655 buf_size -= l;
4656 total_len += l;
4657 }
4658
4659 /* Pass the rest of /proc/stat, start with the last line read */
4660 l = snprintf(buf, buf_size, "%s", line);
4661
4662 if (l < 0) {
4663 perror("Error writing to cache");
54a6d46a 4664 return 0;
056adcef
JS
4665
4666 }
4667 if (l >= buf_size) {
4668 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
54a6d46a 4669 return 0;
056adcef
JS
4670 }
4671
4672 buf += l;
4673 buf_size -= l;
4674 total_len += l;
4675
4676 /* Pass the rest of the host's /proc/stat */
4677 while (getline(&line, &linelen, f) != -1) {
4678 l = snprintf(buf, buf_size, "%s", line);
4679 if (l < 0) {
4680 perror("Error writing to cache");
54a6d46a 4681 return 0;
056adcef
JS
4682 }
4683 if (l >= buf_size) {
4684 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
54a6d46a 4685 return 0;
056adcef
JS
4686 }
4687 buf += l;
4688 buf_size -= l;
4689 total_len += l;
4690 }
4691
2f49b662
JS
4692 if (stat_node)
4693 pthread_mutex_unlock(&stat_node->lock);
54a6d46a 4694 return total_len;
056adcef
JS
4695}
4696
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * Read handler for the virtualized /proc/stat.
 *
 * Renders a per-container view of /proc/stat: only CPUs in the container's
 * cpuset appear (renumbered from 0), and — when cpuacct data is available —
 * user/system time comes from the container's cpuacct.usage_all with idle
 * derived from the host's values.
 *
 * Buffer layout: the first CPUALL_MAX_SIZE bytes of d->buf are reserved for
 * the aggregate "cpu " line, which can only be computed after all per-CPU
 * lines have been written into the area that follows.
 *
 * Returns the number of bytes copied into @buf, 0 on error (FUSE treats
 * that as EOF), or -EINVAL for an out-of-range offset.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
			  struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free struct cpuacct_usage *cg_cpu_usage = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
		irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
		iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
		steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	int cg_cpu_usage_size = 0;

	/* Non-zero offset: this is a continuation read, serve from cache. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;

	/*
	 * when container run with host pid namespace initpid == 1, cgroup will "/"
	 * we should return host os's /proc contents.
	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
	 */
	if (initpid == 1) {
		return read_file_fuse("/proc/stat", buf, size, d);
	}

	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file_fuse("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		return 0;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		return 0;
	}

	/* Prefer the cpuview path (per-container accounting with cpu limits)
	 * when the cgroup layout supports it and cpuacct data was read. */
	if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide host CPUs that are outside the container's cpuset;
		 * curcpu renumbers the visible ones from 0. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* Short parse or no cpuacct data: echo the host line with the
		 * CPU renumbered; with cpuacct data this is overwritten below. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			/* Whatever the host spent that the container did not is
			 * credited to the container as idle time. */
			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
				new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Now that the sums are known, build the aggregate "cpu " line in the
	 * reserved first CPUALL_MAX_SIZE bytes of d->buf. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
		user_sum,
		nice_sum,
		system_sum,
		idle_sum,
		iowait_sum,
		irq_sum,
		softirq_sum,
		steal_sum,
		guest_sum,
		guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-CPU lines down so they directly follow the aggregate
	 * line (regions may overlap, hence memmove). */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
4935
0ecddf02
CB
4936/* This function retrieves the busy time of a group of tasks by looking at
4937 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4938 * been given it's own cpuacct cgroup. If not, this function will take the busy
4939 * time of all other taks that do not actually belong to the container into
4940 * account as well. If someone has a clever solution for this please send a
4941 * patch!
4942 */
1c4b4e38 4943static double get_reaper_busy(pid_t task)
237e200e 4944{
54a6d46a 4945 __do_free char *cgroup = NULL, *usage_str = NULL;
237e200e 4946 unsigned long usage = 0;
54a6d46a 4947 pid_t initpid;
237e200e 4948
54a6d46a 4949 initpid = lookup_initpid_in_store(task);
237e200e
SH
4950 if (initpid <= 0)
4951 return 0;
4952
4953 cgroup = get_pid_cgroup(initpid, "cpuacct");
4954 if (!cgroup)
54a6d46a 4955 return 0;
6d2f6996 4956 prune_init_slice(cgroup);
54a6d46a
CB
4957 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4958 &usage_str))
4959 return 0;
237e200e 4960
54a6d46a
CB
4961 usage = strtoul(usage_str, NULL, 10);
4962 return ((double)usage / 1000000000);
237e200e
SH
4963}
4964
#if RELOADTEST
/* Drop a marker file so the reload test suite can detect this code ran. */
void iwashere(void)
{
	int marker_fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker_fd >= 0)
		close(marker_fd);
}
#endif
4975
4976/*
4977 * We read /proc/uptime and reuse its second field.
4978 * For the first field, we use the mtime for the reaper for
4979 * the calling pid as returned by getreaperage
4980 */
4981static int proc_uptime_read(char *buf, size_t size, off_t offset,
4982 struct fuse_file_info *fi)
4983{
4984 struct fuse_context *fc = fuse_get_context();
4985 struct file_info *d = (struct file_info *)fi->fh;
1c4b4e38 4986 double busytime = get_reaper_busy(fc->pid);
237e200e 4987 char *cache = d->buf;
a262ddb7 4988 ssize_t total_len = 0;
1c4b4e38 4989 double idletime, reaperage;
237e200e
SH
4990
4991#if RELOADTEST
4992 iwashere();
4993#endif
4994
4995 if (offset){
237e200e
SH
4996 if (!d->cached)
4997 return 0;
bbdf646b
BM
4998 if (offset > d->size)
4999 return -EINVAL;
237e200e
SH
5000 int left = d->size - offset;
5001 total_len = left > size ? size: left;
5002 memcpy(buf, cache + offset, total_len);
5003 return total_len;
5004 }
5005
0ecddf02
CB
5006 reaperage = get_reaper_age(fc->pid);
5007 /* To understand why this is done, please read the comment to the
5008 * get_reaper_busy() function.
5009 */
5010 idletime = reaperage;
5011 if (reaperage >= busytime)
5012 idletime = reaperage - busytime;
237e200e 5013
1c4b4e38 5014 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
bbdf646b 5015 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 5016 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
5017 return 0;
5018 }
5019
5020 d->size = (int)total_len;
5021 d->cached = 1;
5022
5023 if (total_len > size) total_len = size;
5024
5025 memcpy(buf, d->buf, total_len);
5026 return total_len;
5027}
5028
/*
 * Read handler for the virtualized /proc/diskstats.
 *
 * Walks the host's /proc/diskstats and, for each device, substitutes the
 * container's per-device counters taken from its blkio cgroup
 * (io_serviced, io_merged, io_service_bytes, io_wait_time,
 * io_service_time). Devices with no container activity are omitted.
 *
 * Returns the number of bytes copied into @buf, 0 on error, or -EINVAL for
 * an out-of-range offset.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *io_serviced_str = NULL,
		       *io_merged_str = NULL, *io_service_bytes_str = NULL,
		       *io_wait_time_str = NULL, *io_service_time_str = NULL,
		       *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	size_t linelen = 0, total_len = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	int ret;
	char dev_name[72];

	/* Non-zero offset: continuation read, serve from the cached result. */
	if (offset){
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file_fuse("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Fetch all five blkio stat files; fall back to the host's view when
	 * the controller does not support one of them (-EOPNOTSUPP). Other
	 * errors are tolerated: get_blkio_io_value() on a NULL string leaves
	 * the counter at 0. */
	ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	f = fopen("/proc/diskstats", "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		/* Only the device identity comes from the host line. */
		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* blkio reports bytes; /proc/diskstats wants 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* blkio reports nanoseconds; /proc/diskstats wants milliseconds. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* Skip devices the container never touched. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			return 0;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
5172
70dcc12e 5173static int proc_swaps_read(char *buf, size_t size, off_t offset,
66c5e848 5174 struct fuse_file_info *fi)
70dcc12e 5175{
66c5e848
CB
5176 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5177 *memswusage_str = NULL;
70dcc12e
SH
5178 struct fuse_context *fc = fuse_get_context();
5179 struct file_info *d = (struct file_info *)fi->fh;
66c5e848
CB
5180 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5181 memswusage = 0, swap_total = 0, swap_free = 0;
5182 ssize_t total_len = 0;
a262ddb7 5183 ssize_t l = 0;
70dcc12e 5184 char *cache = d->buf;
66c5e848 5185 int ret;
70dcc12e
SH
5186
5187 if (offset) {
66c5e848
CB
5188 int left;
5189
70dcc12e
SH
5190 if (offset > d->size)
5191 return -EINVAL;
66c5e848 5192
70dcc12e
SH
5193 if (!d->cached)
5194 return 0;
66c5e848
CB
5195
5196 left = d->size - offset;
70dcc12e
SH
5197 total_len = left > size ? size: left;
5198 memcpy(buf, cache + offset, total_len);
66c5e848 5199
70dcc12e
SH
5200 return total_len;
5201 }
5202
5203 pid_t initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 5204 if (initpid <= 1 || is_shared_pidns(initpid))
70dcc12e
SH
5205 initpid = fc->pid;
5206 cg = get_pid_cgroup(initpid, "memory");
5207 if (!cg)
5fbea8a6 5208 return read_file_fuse("/proc/swaps", buf, size, d);
6d2f6996 5209 prune_init_slice(cg);
70dcc12e 5210
66c5e848 5211 memlimit = get_min_memlimit(cg, false);
70dcc12e 5212
66c5e848
CB
5213 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5214 if (ret < 0)
5215 return 0;
70dcc12e 5216
70dcc12e
SH
5217 memusage = strtoul(memusage_str, NULL, 10);
5218
66c5e848
CB
5219 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5220 if (ret >= 0)
5221 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5222 if (ret >= 0) {
5223 memswlimit = get_min_memlimit(cg, true);
70dcc12e 5224 memswusage = strtoul(memswusage_str, NULL, 10);
70dcc12e
SH
5225 swap_total = (memswlimit - memlimit) / 1024;
5226 swap_free = (memswusage - memusage) / 1024;
5227 }
5228
5229 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5230
5231 /* When no mem + swap limit is specified or swapaccount=0*/
5232 if (!memswlimit) {
66c5e848
CB
5233 __do_free char *line = NULL;
5234 __do_fclose FILE *f = NULL;
70dcc12e 5235 size_t linelen = 0;
70dcc12e 5236
66c5e848 5237 f = fopen("/proc/meminfo", "r");
70dcc12e 5238 if (!f)
66c5e848 5239 return 0;
70dcc12e
SH
5240
5241 while (getline(&line, &linelen, f) != -1) {
66c5e848 5242 if (startswith(line, "SwapTotal:"))
70dcc12e 5243 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
66c5e848 5244 else if (startswith(line, "SwapFree:"))
70dcc12e 5245 sscanf(line, "SwapFree: %8lu kB", &swap_free);
70dcc12e 5246 }
70dcc12e
SH
5247 }
5248
5249 if (swap_total > 0) {
a262ddb7
CB
5250 l = snprintf(d->buf + total_len, d->size - total_len,
5251 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5252 swap_total, swap_free);
5253 total_len += l;
70dcc12e
SH
5254 }
5255
a262ddb7 5256 if (total_len < 0 || l < 0) {
70dcc12e 5257 perror("Error writing to cache");
66c5e848 5258 return 0;
70dcc12e
SH
5259 }
5260
5261 d->cached = 1;
5262 d->size = (int)total_len;
5263
5264 if (total_len > size) total_len = size;
5265 memcpy(buf, d->buf, total_len);
66c5e848 5266 return total_len;
70dcc12e 5267}
66c5e848 5268
6db4f7a3 5269/*
5270 * Find the process pid from cgroup path.
5271 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5272 * @pid_buf : put pid to pid_buf.
5273 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5274 * @depth : the depth of cgroup in container.
5275 * @sum : return the number of pid.
5276 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5277 */
5278static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5279{
54a6d46a
CB
5280 __do_free char *path = NULL;
5281 __do_close_prot_errno int fd = -EBADF;
5282 __do_fclose FILE *f = NULL;
5283 __do_closedir DIR *dir = NULL;
6db4f7a3 5284 struct dirent *file;
6db4f7a3 5285 size_t linelen = 0;
5286 char *line = NULL;
5287 int pd;
6db4f7a3 5288 char **pid;
5289
5290 /* path = dpath + "/cgroup.procs" + /0 */
54a6d46a
CB
5291 path = malloc(strlen(dpath) + 20);
5292 if (!path)
5293 return sum;
6db4f7a3 5294
5295 strcpy(path, dpath);
54a6d46a 5296 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
6db4f7a3 5297 if (fd < 0)
54a6d46a 5298 return sum;
6db4f7a3 5299
54a6d46a
CB
5300 dir = fdopendir(move_fd(fd));
5301 if (!dir)
5302 return sum;
6db4f7a3 5303
5304 while (((file = readdir(dir)) != NULL) && depth > 0) {
54a6d46a 5305 if (strcmp(file->d_name, ".") == 0)
6db4f7a3 5306 continue;
54a6d46a
CB
5307
5308 if (strcmp(file->d_name, "..") == 0)
6db4f7a3 5309 continue;
54a6d46a 5310
6db4f7a3 5311 if (file->d_type == DT_DIR) {
54a6d46a
CB
5312 __do_free char *path_dir = NULL;
5313
6db4f7a3 5314 /* path + '/' + d_name +/0 */
54a6d46a
CB
5315 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5316 if (!path_dir)
5317 return sum;
5318
6db4f7a3 5319 strcpy(path_dir, path);
5320 strcat(path_dir, "/");
5321 strcat(path_dir, file->d_name);
5322 pd = depth - 1;
5323 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
6db4f7a3 5324 }
5325 }
6db4f7a3 5326
5327 strcat(path, "/cgroup.procs");
5328 fd = openat(cfd, path, O_RDONLY);
5329 if (fd < 0)
54a6d46a 5330 return sum;
6db4f7a3 5331
54a6d46a
CB
5332 f = fdopen(move_fd(fd), "r");
5333 if (!f)
5334 return sum;
6db4f7a3 5335
5336 while (getline(&line, &linelen, f) != -1) {
54a6d46a
CB
5337 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5338 if (!pid)
5339 return sum;
6db4f7a3 5340 *pid_buf = pid;
54a6d46a
CB
5341
5342 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5343 if (!*(*pid_buf + sum))
5344 return sum;
5345
6db4f7a3 5346 strcpy(*(*pid_buf + sum), line);
5347 sum++;
5348 }
54a6d46a 5349
6db4f7a3 5350 return sum;
5351}
54a6d46a 5352
6db4f7a3 5353/*
5354 * calc_load calculates the load according to the following formula:
5355 * load1 = load0 * exp + active * (1 - exp)
5356 *
5357 * @load1: the new loadavg.
5358 * @load0: the former loadavg.
5359 * @active: the total number of running pid at this moment.
5360 * @exp: the fixed-point defined in the beginning.
5361 */
5362static unsigned long
5363calc_load(unsigned long load, unsigned long exp, unsigned long active)
5364{
5365 unsigned long newload;
5366
5367 active = active > 0 ? active * FIXED_1 : 0;
5368 newload = load * exp + active * (FIXED_1 - exp);
5369 if (active >= load)
5370 newload += FIXED_1 - 1;
5371
5372 return newload / FIXED_1;
5373}
5374
5375/*
5376 * Return 0 means that container p->cg is closed.
5377 * Return -1 means that error occurred in refresh.
5378 * Positive num equals the total number of pid.
5379 */
5380static int refresh_load(struct load_node *p, char *path)
5381{
54a6d46a 5382 __do_free char *line = NULL;
6db4f7a3 5383 char **idbuf;
5384 char proc_path[256];
5385 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
6db4f7a3 5386 size_t linelen = 0;
5387 int sum, length;
6db4f7a3 5388 struct dirent *file;
5389
54a6d46a
CB
5390 idbuf = malloc(sizeof(char *));
5391 if (!idbuf)
5392 return -1;
5393
6db4f7a3 5394 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5395 /* normal exit */
5396 if (sum == 0)
5397 goto out;
5398
5399 for (i = 0; i < sum; i++) {
54a6d46a
CB
5400 __do_closedir DIR *dp = NULL;
5401
6db4f7a3 5402 /*clean up '\n' */
5403 length = strlen(idbuf[i])-1;
5404 idbuf[i][length] = '\0';
5405 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5406 if (ret < 0 || ret > 255) {
5407 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5408 i = sum;
5409 sum = -1;
5410 goto err_out;
5411 }
5412
5413 dp = opendir(proc_path);
5414 if (!dp) {
5415 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5416 continue;
5417 }
5418 while ((file = readdir(dp)) != NULL) {
54a6d46a
CB
5419 __do_fclose FILE *f = NULL;
5420
6db4f7a3 5421 if (strncmp(file->d_name, ".", 1) == 0)
5422 continue;
5423 if (strncmp(file->d_name, "..", 1) == 0)
5424 continue;
5425 total_pid++;
5426 /* We make the biggest pid become last_pid.*/
5427 ret = atof(file->d_name);
5428 last_pid = (ret > last_pid) ? ret : last_pid;
5429
5430 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5431 if (ret < 0 || ret > 255) {
5432 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5433 i = sum;
5434 sum = -1;
6db4f7a3 5435 goto err_out;
5436 }
54a6d46a 5437
6db4f7a3 5438 f = fopen(proc_path, "r");
5439 if (f != NULL) {
5440 while (getline(&line, &linelen, f) != -1) {
5441 /* Find State */
5442 if ((line[0] == 'S') && (line[1] == 't'))
5443 break;
5444 }
54a6d46a 5445
6db4f7a3 5446 if ((line[7] == 'R') || (line[7] == 'D'))
5447 run_pid++;
6db4f7a3 5448 }
5449 }
6db4f7a3 5450 }
5451 /*Calculate the loadavg.*/
5452 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5453 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5454 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5455 p->run_pid = run_pid;
5456 p->total_pid = total_pid;
5457 p->last_pid = last_pid;
5458
beb5024e 5459err_out:
6db4f7a3 5460 for (; i > 0; i--)
5461 free(idbuf[i-1]);
5462out:
5463 free(idbuf);
5464 return sum;
5465}
54a6d46a 5466
6db4f7a3 5467/*
5468 * Traverse the hash table and update it.
5469 */
5470void *load_begin(void *arg)
5471{
5472
6db4f7a3 5473 int i, sum, length, ret;
5474 struct load_node *f;
5475 int first_node;
5476 clock_t time1, time2;
5477
5478 while (1) {
a83618e2
JS
5479 if (loadavg_stop == 1)
5480 return NULL;
5481
6db4f7a3 5482 time1 = clock();
5483 for (i = 0; i < LOAD_SIZE; i++) {
5484 pthread_mutex_lock(&load_hash[i].lock);
5485 if (load_hash[i].next == NULL) {
5486 pthread_mutex_unlock(&load_hash[i].lock);
5487 continue;
5488 }
5489 f = load_hash[i].next;
5490 first_node = 1;
5491 while (f) {
54a6d46a
CB
5492 __do_free char *path = NULL;
5493
6db4f7a3 5494 length = strlen(f->cg) + 2;
6db4f7a3 5495 /* strlen(f->cg) + '.' or '' + \0 */
54a6d46a
CB
5496 path = malloc(length);
5497 if (!path)
5498 goto out;
6db4f7a3 5499
075387cd 5500 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
6db4f7a3 5501 if (ret < 0 || ret > length - 1) {
5502 /* snprintf failed, ignore the node.*/
5503 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5504 goto out;
5505 }
54a6d46a 5506
6db4f7a3 5507 sum = refresh_load(f, path);
54a6d46a 5508 if (sum == 0)
6db4f7a3 5509 f = del_node(f, i);
54a6d46a 5510 else
6db4f7a3 5511out: f = f->next;
6db4f7a3 5512 /* load_hash[i].lock locks only on the first node.*/
5513 if (first_node == 1) {
5514 first_node = 0;
5515 pthread_mutex_unlock(&load_hash[i].lock);
5516 }
5517 }
5518 }
a83618e2
JS
5519
5520 if (loadavg_stop == 1)
5521 return NULL;
5522
6db4f7a3 5523 time2 = clock();
5524 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5525 }
5526}
5527
5528static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5529 struct fuse_file_info *fi)
5530{
5531 struct fuse_context *fc = fuse_get_context();
5532 struct file_info *d = (struct file_info *)fi->fh;
5533 pid_t initpid;
5534 char *cg;
5535 size_t total_len = 0;
5536 char *cache = d->buf;
5537 struct load_node *n;
5538 int hash;
01d88ede 5539 int cfd, rv = 0;
6db4f7a3 5540 unsigned long a, b, c;
5541
5542 if (offset) {
5543 if (offset > d->size)
5544 return -EINVAL;
5545 if (!d->cached)
5546 return 0;
5547 int left = d->size - offset;
5548 total_len = left > size ? size : left;
5549 memcpy(buf, cache + offset, total_len);
5550 return total_len;
5551 }
5552 if (!loadavg)
5fbea8a6 5553 return read_file_fuse("/proc/loadavg", buf, size, d);
6db4f7a3 5554
5555 initpid = lookup_initpid_in_store(fc->pid);
6e3637bb 5556 if (initpid <= 1 || is_shared_pidns(initpid))
6db4f7a3 5557 initpid = fc->pid;
5558 cg = get_pid_cgroup(initpid, "cpu");
5559 if (!cg)
5fbea8a6 5560 return read_file_fuse("/proc/loadavg", buf, size, d);
6db4f7a3 5561
5562 prune_init_slice(cg);
b077527b 5563 hash = calc_hash(cg) % LOAD_SIZE;
6db4f7a3 5564 n = locate_node(cg, hash);
5565
5566 /* First time */
5567 if (n == NULL) {
d298bba1 5568 cfd = get_cgroup_fd("cpu");
5fbea8a6 5569 if (cfd >= 0) {
6db4f7a3 5570 /*
5571 * In locate_node() above, pthread_rwlock_unlock() isn't used
5572 * because delete is not allowed before read has ended.
5573 */
5574 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
5575 rv = 0;
5576 goto err;
6db4f7a3 5577 }
5578 do {
5579 n = malloc(sizeof(struct load_node));
5580 } while (!n);
5581
5582 do {
5583 n->cg = malloc(strlen(cg)+1);
5584 } while (!n->cg);
5585 strcpy(n->cg, cg);
5586 n->avenrun[0] = 0;
5587 n->avenrun[1] = 0;
5588 n->avenrun[2] = 0;
5589 n->run_pid = 0;
5590 n->total_pid = 1;
5591 n->last_pid = initpid;
5592 n->cfd = cfd;
5593 insert_node(&n, hash);
5594 }
5595 a = n->avenrun[0] + (FIXED_1/200);
5596 b = n->avenrun[1] + (FIXED_1/200);
5597 c = n->avenrun[2] + (FIXED_1/200);
5598 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5599 LOAD_INT(a), LOAD_FRAC(a),
5600 LOAD_INT(b), LOAD_FRAC(b),
5601 LOAD_INT(c), LOAD_FRAC(c),
5602 n->run_pid, n->total_pid, n->last_pid);
5603 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5604 if (total_len < 0 || total_len >= d->buflen) {
5605 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
5606 rv = 0;
5607 goto err;
6db4f7a3 5608 }
5609 d->size = (int)total_len;
5610 d->cached = 1;
5611
5612 if (total_len > size)
5613 total_len = size;
5614 memcpy(buf, d->buf, total_len);
01d88ede
JS
5615 rv = total_len;
5616
5617err:
5618 free(cg);
5619 return rv;
6db4f7a3 5620}
5621/* Return a positive number on success, return 0 on failure.*/
5622pthread_t load_daemon(int load_use)
5623{
5624 int ret;
5625 pthread_t pid;
5626
5627 ret = init_load();
5628 if (ret == -1) {
5629 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5630 return 0;
5631 }
5632 ret = pthread_create(&pid, NULL, load_begin, NULL);
5633 if (ret != 0) {
5634 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5635 load_free();
5636 return 0;
5637 }
5638 /* use loadavg, here loadavg = 1*/
5639 loadavg = load_use;
5640 return pid;
5641}
70dcc12e 5642
a83618e2
JS
5643/* Returns 0 on success. */
5644int stop_load_daemon(pthread_t pid)
5645{
5646 int s;
5647
5648 /* Signal the thread to gracefully stop */
5649 loadavg_stop = 1;
5650
5651 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5652 if (s != 0) {
5653 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5654 return -1;
5655 }
5656
5657 load_free();
5658 loadavg_stop = 0;
5659
5660 return 0;
5661}
5662
237e200e
SH
/* Estimate the size of a proc file by summing the length of each of
 * its lines. Returns 0 when the file cannot be opened. */
static off_t get_procfile_size(const char *which)
{
	char *buf = NULL;
	size_t cap = 0;
	ssize_t n;
	off_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	while ((n = getline(&buf, &cap, fp)) != -1)
		total += n;

	fclose(fp);
	free(buf);

	return total;
}
5679
5680int proc_getattr(const char *path, struct stat *sb)
5681{
5682 struct timespec now;
5683
5684 memset(sb, 0, sizeof(struct stat));
5685 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5686 return -EINVAL;
5687 sb->st_uid = sb->st_gid = 0;
5688 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5689 if (strcmp(path, "/proc") == 0) {
5690 sb->st_mode = S_IFDIR | 00555;
5691 sb->st_nlink = 2;
5692 return 0;
5693 }
5694 if (strcmp(path, "/proc/meminfo") == 0 ||
5695 strcmp(path, "/proc/cpuinfo") == 0 ||
5696 strcmp(path, "/proc/uptime") == 0 ||
5697 strcmp(path, "/proc/stat") == 0 ||
70dcc12e 5698 strcmp(path, "/proc/diskstats") == 0 ||
46be8eed 5699 strcmp(path, "/proc/swaps") == 0 ||
5700 strcmp(path, "/proc/loadavg") == 0) {
237e200e
SH
5701 sb->st_size = 0;
5702 sb->st_mode = S_IFREG | 00444;
5703 sb->st_nlink = 1;
5704 return 0;
5705 }
5706
5707 return -ENOENT;
5708}
5709
5710int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5711 struct fuse_file_info *fi)
5712{
d639f863
CB
5713 if (filler(buf, ".", NULL, 0) != 0 ||
5714 filler(buf, "..", NULL, 0) != 0 ||
5715 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5716 filler(buf, "meminfo", NULL, 0) != 0 ||
5717 filler(buf, "stat", NULL, 0) != 0 ||
5718 filler(buf, "uptime", NULL, 0) != 0 ||
5719 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5720 filler(buf, "swaps", NULL, 0) != 0 ||
5721 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5722 return -EINVAL;
5723 return 0;
5724}
5725
5726int proc_open(const char *path, struct fuse_file_info *fi)
5727{
5728 int type = -1;
5729 struct file_info *info;
5730
5731 if (strcmp(path, "/proc/meminfo") == 0)
5732 type = LXC_TYPE_PROC_MEMINFO;
5733 else if (strcmp(path, "/proc/cpuinfo") == 0)
5734 type = LXC_TYPE_PROC_CPUINFO;
5735 else if (strcmp(path, "/proc/uptime") == 0)
5736 type = LXC_TYPE_PROC_UPTIME;
5737 else if (strcmp(path, "/proc/stat") == 0)
5738 type = LXC_TYPE_PROC_STAT;
5739 else if (strcmp(path, "/proc/diskstats") == 0)
5740 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
5741 else if (strcmp(path, "/proc/swaps") == 0)
5742 type = LXC_TYPE_PROC_SWAPS;
46be8eed 5743 else if (strcmp(path, "/proc/loadavg") == 0)
5744 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
5745 if (type == -1)
5746 return -ENOENT;
5747
5748 info = malloc(sizeof(*info));
5749 if (!info)
5750 return -ENOMEM;
5751
5752 memset(info, 0, sizeof(*info));
5753 info->type = type;
5754
5755 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5756 do {
5757 info->buf = malloc(info->buflen);
5758 } while (!info->buf);
5759 memset(info->buf, 0, info->buflen);
5760 /* set actual size to buffer size */
5761 info->size = info->buflen;
5762
5763 fi->fh = (unsigned long)info;
5764 return 0;
5765}
5766
bddbb106
SH
/* FUSE access handler: /proc itself is accessible when the host grants
 * read access; everything below it is read-only. */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if ((mask & ~R_OK) != 0)
		return -EACCES;

	return 0;
}
5777
237e200e
SH
/* FUSE release handler for the virtualized proc files: frees the
 * struct file_info that proc_open() stashed in fi->fh. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5783
5784int proc_read(const char *path, char *buf, size_t size, off_t offset,
5785 struct fuse_file_info *fi)
5786{
5787 struct file_info *f = (struct file_info *) fi->fh;
5788
5789 switch (f->type) {
5790 case LXC_TYPE_PROC_MEMINFO:
5791 return proc_meminfo_read(buf, size, offset, fi);
5792 case LXC_TYPE_PROC_CPUINFO:
5793 return proc_cpuinfo_read(buf, size, offset, fi);
5794 case LXC_TYPE_PROC_UPTIME:
5795 return proc_uptime_read(buf, size, offset, fi);
5796 case LXC_TYPE_PROC_STAT:
5797 return proc_stat_read(buf, size, offset, fi);
5798 case LXC_TYPE_PROC_DISKSTATS:
5799 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
5800 case LXC_TYPE_PROC_SWAPS:
5801 return proc_swaps_read(buf, size, offset, fi);
46be8eed 5802 case LXC_TYPE_PROC_LOADAVG:
5803 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
5804 default:
5805 return -EINVAL;
5806 }
5807}
5808
29a73c2f
CB
5809/*
5810 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
5811 */
5812
29a73c2f
CB
5813static bool umount_if_mounted(void)
5814{
5815 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5816 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5817 return false;
5818 }
5819 return true;
5820}
5821
2283e240
CB
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Compare the filesystem magic reported by statfs() with @magic_val,
 * casting to the f_type width to avoid signedness surprises. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	if (fs->f_type == (fs_type_magic)magic_val)
		return true;

	return false;
}
5828
0a4dea41
CB
5829/*
5830 * looking at fs/proc_namespace.c, it appears we can
5831 * actually expect the rootfs entry to very specifically contain
5832 * " - rootfs rootfs "
5833 * IIUC, so long as we've chrooted so that rootfs is not our root,
5834 * the rootfs entry should always be skipped in mountinfo contents.
5835 */
5836static bool is_on_ramfs(void)
5837{
5838 FILE *f;
5839 char *p, *p2;
5840 char *line = NULL;
5841 size_t len = 0;
5842 int i;
5843
5844 f = fopen("/proc/self/mountinfo", "r");
5845 if (!f)
5846 return false;
5847
5848 while (getline(&line, &len, f) != -1) {
5849 for (p = line, i = 0; p && i < 4; i++)
5850 p = strchr(p + 1, ' ');
5851 if (!p)
5852 continue;
5853 p2 = strchr(p + 1, ' ');
5854 if (!p2)
5855 continue;
5856 *p2 = '\0';
5857 if (strcmp(p + 1, "/") == 0) {
5858 // this is '/'. is it the ramfs?
5859 p = strchr(p2 + 1, '-');
5860 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5861 free(line);
5862 fclose(f);
5863 return true;
5864 }
5865 }
5866 }
5867 free(line);
5868 fclose(f);
5869 return false;
5870}
5871
cc309f33 5872static int pivot_enter()
0a4dea41 5873{
cc309f33
CB
5874 int ret = -1, oldroot = -1, newroot = -1;
5875
5876 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5877 if (oldroot < 0) {
5878 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5879 return ret;
5880 }
5881
5882 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5883 if (newroot < 0) {
5884 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5885 goto err;
5886 }
5887
5888 /* change into new root fs */
5889 if (fchdir(newroot) < 0) {
5890 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5891 goto err;
5892 }
5893
0a4dea41
CB
5894 /* pivot_root into our new root fs */
5895 if (pivot_root(".", ".") < 0) {
5896 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5897 goto err;
0a4dea41
CB
5898 }
5899
5900 /*
5901 * At this point the old-root is mounted on top of our new-root.
5902 * To unmounted it we must not be chdir'd into it, so escape back
5903 * to the old-root.
5904 */
5905 if (fchdir(oldroot) < 0) {
5906 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5907 goto err;
0a4dea41
CB
5908 }
5909
5910 if (umount2(".", MNT_DETACH) < 0) {
5911 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5912 goto err;
0a4dea41
CB
5913 }
5914
5915 if (fchdir(newroot) < 0) {
5916 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 5917 goto err;
0a4dea41
CB
5918 }
5919
cc309f33
CB
5920 ret = 0;
5921
5922err:
5923 if (oldroot > 0)
5924 close(oldroot);
5925 if (newroot > 0)
5926 close(newroot);
5927
5928 return ret;
0a4dea41
CB
5929}
5930
5931static int chroot_enter()
5932{
5933 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5934 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5935 return -1;
5936 }
5937
5938 if (chroot(".") < 0) {
5939 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5940 return -1;
5941 }
5942
5943 if (chdir("/") < 0) {
5944 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5945 return -1;
5946 }
5947
5948 return 0;
5949}
5950
/* Enter the prepared new root: chroot() when running on a ramfs,
 * pivot_root() in every other case. Returns 0 on success, -1 on
 * error. */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/1/mountinfo. */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() < 0) {
		lxcfs_error("%s\n", "Could not perform pivot root.");
		return -1;
	}

	return 0;
}
5973
5974/* Prepare our new clean root. */
0232cbac 5975static int permute_prepare(void)
29a73c2f
CB
5976{
5977 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 5978 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
5979 return -1;
5980 }
5981
5982 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 5983 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
5984 return -1;
5985 }
5986
5987 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 5988 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
5989 return -1;
5990 }
5991
5992 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 5993 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
5994 return -1;
5995 }
5996
5997 return 0;
5998}
5999
0232cbac
CB
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare new root. */
	if (permute_prepare() < 0)
		return false;

	/* Pivot into new root. */
	return permute_and_enter() == 0;
}
6013
a257a8ee
CB
6014static int preserve_mnt_ns(int pid)
6015{
6016 int ret;
6017 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6018 char path[len];
6019
6020 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6021 if (ret < 0 || (size_t)ret >= len)
6022 return -1;
6023
6024 return open(path, O_RDONLY | O_CLOEXEC);
6025}
6026
/* Create BASEDIR, unshare a private mount namespace (preserving a
 * handle to it in cgroup_ops->mntns_fd) and mount a fresh tmpfs at
 * BASEDIR to hold the lxcfs view of the cgroup hierarchies. Returns
 * true on success. */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Clear out any stale mount left over from a previous instance. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Everything below happens in a private mount namespace. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a handle to the new namespace so cgroup code can re-enter it. */
	cgroup_ops->mntns_fd = preserve_mnt_ns(getpid());
	if (cgroup_ops->mntns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Make / private so our mounts do not propagate back to the host. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6062
/* Mount the cgroup hierarchies below BASEDIR and cache an O_DIRECTORY
 * fd to each hierarchy's mountpoint in its struct hierarchy for later
 * *at()-style lookups. Returns true on success. */
static bool cgfs_mount_hierarchies(void)
{
	if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
		return false;

	if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
		return false;

	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
		/* __do_free frees path automatically at end of iteration. */
		__do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
		(*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
		if ((*h)->fd < 0)
			return false;
	}

	return true;
}
6080
/* Set up the private lxcfs cgroup mounts and pivot into the prepared
 * root. Returns true on success. */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6096
2243c5a9 6097static void __attribute__((constructor)) lxcfs_init(void)
237e200e 6098{
2243c5a9 6099 __do_close_prot_errno int init_ns = -EBADF;
5fbea8a6 6100 char *cret;
e58dab00 6101 char cwd[MAXPATHLEN];
237e200e 6102
5fbea8a6
CB
6103 cgroup_ops = cgroup_init();
6104 if (!cgroup_ops)
2243c5a9 6105 log_exit("Failed to initialize cgroup support");
237e200e 6106
480262c9 6107 /* Preserve initial namespace. */
a257a8ee 6108 init_ns = preserve_mnt_ns(getpid());
2243c5a9
CB
6109 if (init_ns < 0)
6110 log_exit("Failed to preserve initial mount namespace");
480262c9 6111
e58dab00 6112 cret = getcwd(cwd, MAXPATHLEN);
2243c5a9 6113 log_exit("%s - Could not retrieve current working directory", strerror(errno));
e58dab00 6114
480262c9
CB
6115 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6116 * to privately mount lxcfs cgroups. */
2243c5a9
CB
6117 if (!cgfs_setup_controllers())
6118 log_exit("Failed to setup private cgroup mounts for lxcfs");
480262c9 6119
2243c5a9
CB
6120 if (setns(init_ns, 0) < 0)
6121 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
29a73c2f 6122
e58dab00 6123 if (!cret || chdir(cwd) < 0)
2243c5a9 6124 log_exit("%s - Could not change back to original working directory", strerror(errno));
e58dab00 6125
2243c5a9
CB
6126 if (!init_cpuview())
6127 log_exit("Failed to init CPU view");
056adcef 6128
237e200e 6129 print_subsystems();
237e200e
SH
6130}
6131
/* Library destructor: tear down the per-cpu view state and release the
 * cgroup handles acquired in lxcfs_init(). */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_debug("%s\n", "Running destructor for liblxcfs");
	free_cpuview();
	cgroup_exit(cgroup_ops);
}