]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
calc_hash(): do not apply modulo LOAD_SIZE
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
/*
 * Buffer length for a 64 bit integer printed as a decimal string:
 * 2^64 - 1 has 20 digits, and one more byte is reserved (presumably for the
 * terminating NUL), giving 21.
 */
#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
/*
 * pivot_root() wrapper: use the C library's version when HAVE_PIVOT_ROOT is
 * defined, otherwise issue the raw system call (or fail with ENOSYS when the
 * syscall number is not known at build time).
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
237e200e
SH
/* Identifiers for the kinds of files handled: cgroup entries and /proc files. */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE = 1,
	LXC_TYPE_PROC_MEMINFO = 2,
	LXC_TYPE_PROC_CPUINFO = 3,
	LXC_TYPE_PROC_UPTIME = 4,
	LXC_TYPE_PROC_STAT = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
	LXC_TYPE_PROC_SWAPS = 7,
	LXC_TYPE_PROC_LOADAVG = 8,
};
71
/* Per-file record: which controller/cgroup/file it names plus buffer state. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
8be92dd1
JS
/* A pair of 64 bit counters: user and system usage. */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
};
87
/* Parameters of the loadavg hash table. */
#define LOAD_SIZE 100 /* the size of hash_table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */

/* Fixed-point helpers used to calculate loadavg. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1 << FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
/* Integer part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
/* Fractional part of a fixed-point value, scaled to hundredths. */
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)
/*
 * Switch consulted by proc_loadavg_read():
 * 1 means use loadavg, 0 means do not use it.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * calc_hash - hash a NUL-terminated string with the ELFHash algorithm.
 *
 * Returns the hash with the top bit masked off, so the result is never
 * negative. The value is not reduced modulo the table size here.
 */
static int calc_hash(char *name)
{
	unsigned int h = 0;
	char *p;

	for (p = name; *p != '\0'; p++) {
		unsigned int top;

		h = (h << 4) + *p;
		/* Fold the upper nibble back into the low bits, then clear it. */
		top = h & 0xf0000000;
		if (top)
			h ^= top >> 24;
		h &= ~top;
	}

	return h & 0x7fffffff;
}
120
/* One entry of a loadavg hash bucket; entries form a doubly linked chain. */
struct load_node {
	char *cg;			/* cg */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;		/* following entry, or NULL */
	struct load_node **pre;		/* address of the pointer that points at this entry */
};
131
/* Head of one loadavg hash bucket together with the locks guarding it. */
struct load_head {
	/*
	 * Serializes inserting a load_node against refreshing load_nodes:
	 * for the first load_node of a bucket, insert and refresh are
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * Serializes reading loadavg against deleting a load_node within a
	 * bucket, while still allowing concurrent readers. This lock works
	 * at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Serializes reading loadavg against inserting a load_node: for the
	 * first load_node of a bucket, read and insert are mutually
	 * exclusive, while concurrent readers are still allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
153
154static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155/*
156 * init_load initialize the hash table.
157 * Return 0 on success, return -1 on failure.
158 */
159static int init_load(void)
160{
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195}
196
197static void insert_node(struct load_node **n, int locate)
198{
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212}
213/*
214 * locate_node() finds special node. Not return NULL means success.
215 * It should be noted that rdlock isn't unlocked at the end of code
216 * because this function is used to read special node. Delete is not
217 * allowed before read has ended.
218 * unlock rdlock only in proc_loadavg_read().
219 */
220static struct load_node *locate_node(char *cg, int locate)
221{
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236}
237/* Delete the load_node n and return the next node of it. */
238static struct load_node *del_node(struct load_node *n, int locate)
239{
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254}
255
a83618e2 256static void load_free(void)
9c480eb7 257{
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287}
f34de69a
CB
/* Extra buffer space reserved to account for file size changes. */
#define BUF_RESERVE_SIZE 512
237e200e
SH
290
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we:
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *    a. If not, fork a child in qpid's ns to send us ucred.pid = 1, and
 *       read the initpid. Cache initpid and creation time for
 *       /proc/initpid in a new store entry.
 *    b. If so, verify that /proc/initpid still matches what we have
 *       saved. If not, clear the store entry and go back to a. If so,
 *       return the cached initpid.
 */
struct pidns_init_store {
	ino_t ino;		/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;		/* the pid of init in that ns */
	long int ctime;		/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;
	long int lastcheck;	/* time of the last successful lookup of this entry */
};
312
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Map a value (an inode number in this file) to a bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of the init pid cache, and the mutex that guards them. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * lock_mutex - lock @l; on failure log the error code and exit(1).
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
328
29a73c2f
CB
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace, even when located in
 * another namespace, using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;
29a73c2f 349
237e200e
SH
/*
 * unlock_mutex - unlock @l; on failure log the error code and exit(1).
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
359
360static void store_lock(void)
361{
362 lock_mutex(&pidns_store_mutex);
363}
364
365static void store_unlock(void)
366{
367 unlock_mutex(&pidns_store_mutex);
368}
369
370/* Must be called under store_lock */
371static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372{
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
7dd6560a
CB
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
237e200e
SH
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386}
387
388/* Must be called under store_lock */
389static void remove_initpid(struct pidns_init_store *e)
390{
391 struct pidns_init_store *tmp;
392 int h;
393
7dd6560a
CB
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
237e200e
SH
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412}
413
414#define PURGE_SECS 5
415/* Must be called under store_lock */
416static void prune_initpid_store(void)
417{
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
7dd6560a
CB
430
431 lxcfs_debug("%s\n", "Pruning.");
432
237e200e
SH
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
7dd6560a
CB
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
237e200e
SH
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455}
456
457/* Must be called under store_lock */
458static void save_initpid(struct stat *sb, pid_t pid)
459{
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
7dd6560a
CB
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
237e200e
SH
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480}
481
482/*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490{
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507}
508
/*
 * is_dir - report whether @path, resolved relative to the directory file
 * descriptor @fd, is a directory.
 *
 * Returns 1 if the path could be stat'ed and is a directory, 0 otherwise.
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;

	/*
	 * The last argument of fstatat() is a bitmask of AT_* flags, not a
	 * file descriptor. Passing @fd there makes the call fail with EINVAL
	 * for ordinary descriptor values, so use 0 (default behaviour).
	 */
	if (fstatat(fd, path, &statbuf, 0) != 0)
		return 0;

	return S_ISDIR(statbuf.st_mode) ? 1 : 0;
}
517
/*
 * must_copy_string - duplicate @str, retrying until allocation succeeds.
 *
 * Returns NULL only when @str is NULL; otherwise a heap copy that the
 * caller must free().
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		;	/* retry until strdup() succeeds */

	return copy;
}
529
/* Strip every '\n' at the end of @s by overwriting it with NUL, in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		s[end - 1] = '\0';
		end--;
	}
}
537
#define BATCH_SIZE 50
/*
 * dorealloc - make sure *mem can hold @newlen bytes, growing in units of
 * BATCH_SIZE bytes.
 *
 * The buffer is (re)allocated when it does not exist yet or when @newlen
 * needs more batches than @oldlen did. Allocation is retried until it
 * succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int need = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && need <= have)
		return;

	do {
		grown = realloc(*mem, need * BATCH_SIZE);
	} while (!grown);

	*mem = grown;
}
/*
 * append_line - append @line (@linelen bytes plus its terminating NUL) to
 * the buffer *contents, whose current length is *len, growing the buffer
 * as needed. *len is updated to the new length, not counting the NUL.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
559
/*
 * slurp_file - read everything from @fd into one heap string.
 *
 * Takes ownership of @fd: it is closed before returning on every path,
 * including when fdopen() fails (the original code leaked it there).
 * Trailing newlines are stripped from the result. @from is not used.
 *
 * Returns the contents (caller frees), or NULL if the stream could not be
 * opened or nothing was read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/* No stream owns the descriptor, so release it ourselves. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
581
/*
 * write_string - write @string to the file open on @fd.
 *
 * Takes ownership of @fd: it is closed before returning on every path,
 * including when fdopen() fails (the original code leaked it there).
 * @fnam is used only in error messages.
 *
 * Returns true if the whole string was written and the stream closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* No stream owns the descriptor, so release it ourselves. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes, so a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
607
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
613
0619767c 614#define ALLOC_NUM 20
237e200e
SH
615static bool store_hierarchy(char *stridx, char *h)
616{
0619767c
SH
617 if (num_hierarchies % ALLOC_NUM == 0) {
618 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
619 n *= ALLOC_NUM;
620 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 621 if (!tmp) {
b8defc3d 622 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
623 exit(1);
624 }
237e200e 625 hierarchies = tmp;
237e200e 626 }
f676eb79 627
0619767c 628 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
629 return true;
630}
631
632static void print_subsystems(void)
633{
634 int i;
635
a257a8ee 636 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 637 fprintf(stderr, "hierarchies:\n");
237e200e
SH
638 for (i = 0; i < num_hierarchies; i++) {
639 if (hierarchies[i])
b8defc3d
CB
640 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
641 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
642 }
643}
644
/*
 * in_comma_list - report whether @needle equals one of the comma-separated
 * elements of @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t nlen = strlen(needle);
	const char *tok = haystack;
	const char *comma;

	/* Check every element that is followed by a comma. */
	for (; *tok != '\0'; tok = comma + 1) {
		comma = strchr(tok, ',');
		if (comma == NULL)
			break;

		if ((size_t)(comma - tok) == nlen &&
		    strncmp(needle, tok, nlen) == 0)
			return true;
	}

	/* What remains is the last element (possibly empty). */
	return strcmp(needle, tok) == 0;
}
663
664/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
665/* Return the mounted controller and store the corresponding open file descriptor
666 * referring to the controller mountpoint in the private lxcfs namespace in
667 * @cfd.
668 */
669static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
670{
671 int i;
672
673 for (i = 0; i < num_hierarchies; i++) {
674 if (!hierarchies[i])
675 continue;
5dd3e6fd
CB
676 if (strcmp(hierarchies[i], controller) == 0) {
677 *cfd = fd_hierarchies[i];
237e200e 678 return hierarchies[i];
5dd3e6fd
CB
679 }
680 if (in_comma_list(controller, hierarchies[i])) {
681 *cfd = fd_hierarchies[i];
237e200e 682 return hierarchies[i];
5dd3e6fd 683 }
237e200e
SH
684 }
685
686 return NULL;
687}
688
/*
 * cgfs_set_value - write @value to @file inside @cgroup of @controller.
 *
 * Returns false if the controller is not mounted, the path cannot be
 * built, the file cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, written;
	size_t len;
	char *path;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	written = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
715
/*
 * chown_all_cgroup_files - chown every entry of the cgroup directory
 * @dirname (relative to directory fd @fd) to @uid:@gid.
 *
 * We do this when we create a cgroup on behalf of a user. Failures are
 * logged and otherwise ignored (best effort); "." and ".." are skipped.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not take over fd1, so close it here. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	/* closedir() also closes fd1. */
	closedir(d);
}
755
/*
 * cgfs_create - create cgroup @cg under @controller and, unless both @uid
 * and @gid are 0, hand it and the files inside it over to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or
 * -errno when mkdirat() or fchownat() on the new directory fails.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(dirnam, uid, gid, cfd);
	}

	return 0;
}
786
/*
 * recursive_rmdir - remove directory @dirname (a path relative to the
 * controller mount fd @cfd) after trying to remove its subdirectories.
 *
 * @fd is an open descriptor used to enumerate the directory; it is
 * duplicated so that the caller's descriptor is not consumed by the
 * directory stream.
 *
 * Returns true only if the directory stream closed cleanly and @dirname
 * itself was removed.
 *
 * NOTE(review): the recursive call passes the same @fd rather than a
 * descriptor opened on the subdirectory - confirm that this is intended.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		/* The stream never took over dupfd, so close it here. */
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	/*
	 * closedir() closes dupfd as well. The original code called
	 * close(dupfd) again afterwards; that second close could hit a
	 * descriptor another thread had opened in the meantime.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
845
/*
 * cgfs_remove - remove cgroup @cg of @controller via recursive_rmdir().
 *
 * Returns false if the controller is not mounted or the directory cannot
 * be opened; otherwise the result of recursive_rmdir().
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd;
	size_t len;
	char *dirnam;
	bool removed;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, dirnam, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(dirnam, dirfd, cfd);
	close(dirfd);

	return removed;
}
872
/*
 * cgfs_chmod_file - set the mode of @file under @controller to @mode.
 *
 * Returns false if the controller is not mounted or fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, pathname, mode, 0) >= 0;
}
893
/*
 * chown_tasks_files - chown "tasks" and then "cgroup.procs" inside
 * @dirname (relative to directory fd @fd) to @uid:@gid.
 *
 * Returns 0 on success, or -errno of the first fchownat() that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names. */
	size_t len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *fname = alloca(len);

	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;

	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
909
/*
 * cgfs_chown_file - chown @file under @controller to @uid:@gid; when it is
 * a directory, also chown the tasks files inside it.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or
 * -errno from the failing fchownat().
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(pathname, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(pathname, uid, gid, cfd);
}
935
/*
 * open_pids_file - open cgroup.procs of @cgroup under @controller for
 * writing.
 *
 * Returns a stream the caller must fclose(), or NULL if the controller is
 * not mounted, the file cannot be opened, or no stream could be created.
 * In the last case the descriptor is closed (the original code leaked it).
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd);

	return f;
}
959
f366da65
WB
960static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
961 void ***list, size_t typesize,
962 void* (*iterator)(const char*, const char*, const char*))
237e200e 963{
4ea38a4c 964 int cfd, fd, ret;
237e200e 965 size_t len;
4ea38a4c 966 char *cg, *tmpc;
237e200e 967 char pathname[MAXPATHLEN];
f366da65 968 size_t sz = 0, asz = 0;
4ea38a4c 969 struct dirent *dirent;
237e200e 970 DIR *dir;
237e200e 971
4ea38a4c 972 tmpc = find_mounted_controller(controller, &cfd);
f366da65 973 *list = NULL;
237e200e 974 if (!tmpc)
e97c834b 975 return false;
237e200e 976
f5a6d92e 977 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
978 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
979 cg = alloca(len);
980 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
981 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 982 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
983 return false;
984 }
237e200e 985
4ea38a4c
CB
986 fd = openat(cfd, cg, O_DIRECTORY);
987 if (fd < 0)
988 return false;
989
990 dir = fdopendir(fd);
237e200e
SH
991 if (!dir)
992 return false;
993
4ea38a4c 994 while ((dirent = readdir(dir))) {
237e200e 995 struct stat mystat;
237e200e 996
4ea38a4c
CB
997 if (!strcmp(dirent->d_name, ".") ||
998 !strcmp(dirent->d_name, ".."))
237e200e
SH
999 continue;
1000
4ea38a4c
CB
1001 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1002 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 1003 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
1004 continue;
1005 }
1006
4ea38a4c 1007 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1008 if (ret) {
b8defc3d 1009 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1010 continue;
1011 }
f366da65
WB
1012 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1013 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1014 continue;
1015
1016 if (sz+2 >= asz) {
f366da65 1017 void **tmp;
237e200e
SH
1018 asz += BATCH_SIZE;
1019 do {
f366da65 1020 tmp = realloc(*list, asz * typesize);
237e200e
SH
1021 } while (!tmp);
1022 *list = tmp;
1023 }
4ea38a4c 1024 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1025 (*list)[sz+1] = NULL;
1026 sz++;
1027 }
1028 if (closedir(dir) < 0) {
b8defc3d 1029 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1030 return false;
1031 }
1032 return true;
1033}
1034
f366da65
WB
/*
 * make_children_list_entry - iterator callback that returns a heap copy of
 * @dir_entry. @controller and @cgroup are not used. Allocation is retried
 * until it succeeds.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while ((copy = strdup(dir_entry)) == NULL)
		;	/* retry until strdup() succeeds */

	return copy;
}
1043
1044bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1045{
1046 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1047}
1048
237e200e
SH
1049void free_key(struct cgfs_files *k)
1050{
1051 if (!k)
1052 return;
1053 free(k->name);
1054 free(k);
1055}
1056
/*
 * Free every key of the NULL-terminated array @keys and then the array
 * itself; a NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (keys == NULL)
		return;

	for (cursor = keys; *cursor != NULL; cursor++)
		free_key(*cursor);

	free(keys);
}
1068
/*
 * cgfs_get_value - read @file inside @cgroup of @controller into *@value.
 *
 * On success *@value holds a heap string. Returns false if the controller
 * is not mounted, the path cannot be built, the file cannot be opened, or
 * slurp_file() returns NULL. *@value is only written once the file has
 * been opened.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, written;
	size_t len;
	char *path;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	written = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= len)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);

	return *value != NULL;
}
1095
1096struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1097{
4ea38a4c 1098 int ret, cfd;
237e200e 1099 size_t len;
f5a6d92e 1100 char *fnam, *tmpc;
237e200e
SH
1101 struct stat sb;
1102 struct cgfs_files *newkey;
237e200e 1103
f5a6d92e 1104 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1105 if (!tmpc)
1106 return false;
1107
1108 if (file && *file == '/')
1109 file++;
1110
06081b29 1111 if (file && strchr(file, '/'))
237e200e
SH
1112 return NULL;
1113
f5a6d92e
CB
1114 /* Make sure we pass a relative path to *at() family of functions.
1115 * . + /cgroup + / + file + \0
1116 */
4ea38a4c 1117 len = strlen(cgroup) + 3;
237e200e
SH
1118 if (file)
1119 len += strlen(file) + 1;
1120 fnam = alloca(len);
4ea38a4c
CB
1121 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1122 file ? "/" : "", file ? file : "");
237e200e 1123
4ea38a4c 1124 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1125 if (ret < 0)
1126 return NULL;
1127
1128 do {
1129 newkey = malloc(sizeof(struct cgfs_files));
1130 } while (!newkey);
1131 if (file)
1132 newkey->name = must_copy_string(file);
06081b29
CB
1133 else if (strrchr(cgroup, '/'))
1134 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1135 else
1136 newkey->name = must_copy_string(cgroup);
1137 newkey->uid = sb.st_uid;
1138 newkey->gid = sb.st_gid;
1139 newkey->mode = sb.st_mode;
1140
1141 return newkey;
1142}
1143
/*
 * Entry constructor used by cgfs_list_keys(): looks up the key for
 * @dir_entry under @controller:@cgroup via cgfs_get_key().  A failed lookup
 * is logged and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		cgroup);
	return NULL;
}
1153
1154bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1155{
1156 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1157}
1158
/*
 * Report whether @f is a directory directly under @cgroup of @controller.
 *
 * Returns false when the controller is not mounted, the path cannot be
 * formatted, fstatat() fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	struct stat st;
	char *relpath;
	size_t pathsz;
	int cfd, n;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() family needs a relative path:
	 * "." + "/cgroup" + "/" + f + "\0"
	 */
	pathsz = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(pathsz);
	n = snprintf(relpath, pathsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathsz)
		return false;

	if (fstatat(cfd, relpath, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode);
}
1186
/* Result codes returned by send_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2

/* Forward declarations for helpers defined further down in this file. */
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1194
1195/*
b10bdd6c 1196 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1197 * over a unix sock so we can read the task's reaper's pid in our
1198 * namespace
b10bdd6c
FG
1199 *
1200 * Note: glibc's fork() does not respect pidns, which can lead to failed
1201 * assertions inside glibc (and thus failed forks) if the child's pid in
1202 * the pidns and the parent pid outside are identical. Using clone prevents
1203 * this issue.
237e200e
SH
1204 */
1205static void write_task_init_pid_exit(int sock, pid_t target)
1206{
237e200e
SH
1207 char fnam[100];
1208 pid_t pid;
237e200e 1209 int fd, ret;
b10bdd6c
FG
1210 size_t stack_size = sysconf(_SC_PAGESIZE);
1211 void *stack = alloca(stack_size);
237e200e
SH
1212
1213 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1214 if (ret < 0 || ret >= sizeof(fnam))
1215 _exit(1);
1216
1217 fd = open(fnam, O_RDONLY);
1218 if (fd < 0) {
1219 perror("write_task_init_pid_exit open of ns/pid");
1220 _exit(1);
1221 }
1222 if (setns(fd, 0)) {
1223 perror("write_task_init_pid_exit setns 1");
1224 close(fd);
1225 _exit(1);
1226 }
b10bdd6c 1227 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1228 if (pid < 0)
1229 _exit(1);
1230 if (pid != 0) {
1231 if (!wait_for_pid(pid))
1232 _exit(1);
1233 _exit(0);
1234 }
b10bdd6c
FG
1235}
1236
1237static int send_creds_clone_wrapper(void *arg) {
1238 struct ucred cred;
1239 char v;
1240 int sock = *(int *)arg;
237e200e
SH
1241
1242 /* we are the child */
1243 cred.uid = 0;
1244 cred.gid = 0;
1245 cred.pid = 1;
1246 v = '1';
1247 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1248 return 1;
1249 return 0;
237e200e
SH
1250}
1251
1252static pid_t get_init_pid_for_task(pid_t task)
1253{
1254 int sock[2];
1255 pid_t pid;
1256 pid_t ret = -1;
1257 char v = '0';
1258 struct ucred cred;
1259
1260 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1261 perror("socketpair");
1262 return -1;
1263 }
1264
1265 pid = fork();
1266 if (pid < 0)
1267 goto out;
1268 if (!pid) {
1269 close(sock[1]);
1270 write_task_init_pid_exit(sock[0], task);
1271 _exit(0);
1272 }
1273
1274 if (!recv_creds(sock[1], &cred, &v))
1275 goto out;
1276 ret = cred.pid;
1277
1278out:
1279 close(sock[0]);
1280 close(sock[1]);
1281 if (pid > 0)
1282 wait_for_pid(pid);
1283 return ret;
1284}
1285
1286static pid_t lookup_initpid_in_store(pid_t qpid)
1287{
1288 pid_t answer = 0;
1289 struct stat sb;
1290 struct pidns_init_store *e;
1291 char fnam[100];
1292
1293 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1294 store_lock();
1295 if (stat(fnam, &sb) < 0)
1296 goto out;
1297 e = lookup_verify_initpid(&sb);
1298 if (e) {
1299 answer = e->initpid;
1300 goto out;
1301 }
1302 answer = get_init_pid_for_task(qpid);
1303 if (answer > 0)
1304 save_initpid(&sb, answer);
1305
1306out:
1307 /* we prune at end in case we are returning
1308 * the value we were about to return */
1309 prune_initpid_store();
1310 store_unlock();
1311 return answer;
1312}
1313
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 *
 * Returns 0 when the child exited normally with status 0, and -1 for an
 * invalid @pid (<= 0), a waitpid() error other than EINTR, or any other
 * termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1334
1335
1336/*
1337 * append pid to *src.
1338 * src: a pointer to a char* in which ot append the pid.
1339 * sz: the number of characters printed so far, minus trailing \0.
1340 * asz: the allocated size so far
1341 * pid: the pid to append
1342 */
1343static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1344{
1345 char tmp[30];
1346
1347 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1348
1349 if (!*src || tmplen + *sz + 1 >= *asz) {
1350 char *tmp;
1351 do {
1352 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1353 } while (!tmp);
1354 *src = tmp;
1355 *asz += BUF_RESERVE_SIZE;
1356 }
bbfd0e33 1357 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1358 *sz += tmplen;
237e200e
SH
1359}
1360
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return the id as mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse as
 * three unsigned numbers are skipped.  Returns the mapped id, or -1
 * (as unsigned int) when no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base;   /* first id of the range inside the namespace */
		unsigned int host_base; /* first id of the range in the caller's namespace */
		unsigned int range;     /* number of ids in the range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around - unexpected for a proc
			 * file, so give up.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		if (in_id >= host_base && in_id < host_base + range)
			return ns_base + (in_id - host_base);
	}

	/* no matching range */
	return -1;
}
1404
/*
 * For is_privileged_over(): whether the calling uid is required to be root
 * in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Buffer size used for /proc/<pid>/... path names. */
#define PROCLEN 100

/*
 * Decide whether @uid (as seen in the namespace of @pid) is privileged over
 * @victim.
 *
 * - Either id being -1 yields false.
 * - With @req_ns_root false, identical ids are sufficient.
 * - Otherwise @uid must map to 0 through /proc/<pid>/uid_map and @victim
 *   must be mapped by that same file.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	bool answer = false;
	FILE *map;
	int n;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * When namespace root is not required, owning the object is enough
	 * (e.g. uid 1000 acting on something owned by uid 1000).
	 */
	if (!req_ns_root && uid == victim)
		return true;

	n = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (n < 0 || n >= PROCLEN)
		return false;

	map = fopen(fpath, "r");
	if (!map)
		return false;

	/*
	 * The caller has to be root in its namespace, and the victim has to
	 * be mapped into that namespace.  The second lookup only runs when
	 * the first one succeeded.
	 * XXX The victim check may be redundant if fuse already hands us ids
	 * that the vfs has converted.
	 */
	if (convert_id_to_ns(map, uid) == 0 &&
	    convert_id_to_ns(map, victim) != -1)
		answer = true;

	fclose(map);
	return answer;
}
1460
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode.  Callers shift the owner or
 * group bits down into the "other" position before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY,
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1480
1481
/*
 * Return the path component of @taskcg that follows @querycg.
 *
 * Example: taskcg "/a/b/c/d", querycg "/a/b" -> "c".
 * For querycg "/" or "./" the first component of @taskcg is returned.
 *
 * @taskcg must be strictly longer than @querycg, otherwise an error is
 * logged and NULL is returned.  The function does not check that @querycg
 * really is a prefix of @taskcg.  The result is heap allocated and must be
 * freed by the caller; NULL is also returned when strdup() fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	/* Skip the query prefix plus the '/' that separates it from the rest. */
	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* Keep only the first remaining component. */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
1507
/* Remove one trailing '\n' from @x, in place, if there is one. */
static void stripnewline(char *x)
{
	size_t n = strlen(x);

	if (n == 0)
		return;
	if (x[n - 1] == '\n')
		x[n - 1] = '\0';
}
1514
1515static char *get_pid_cgroup(pid_t pid, const char *contrl)
1516{
5dd3e6fd 1517 int cfd;
237e200e
SH
1518 char fnam[PROCLEN];
1519 FILE *f;
1520 char *answer = NULL;
1521 char *line = NULL;
1522 size_t len = 0;
1523 int ret;
5dd3e6fd 1524 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1525 if (!h)
1526 return NULL;
1527
1528 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1529 if (ret < 0 || ret >= PROCLEN)
1530 return NULL;
1531 if (!(f = fopen(fnam, "r")))
1532 return NULL;
1533
1534 while (getline(&line, &len, f) != -1) {
1535 char *c1, *c2;
1536 if (!line[0])
1537 continue;
1538 c1 = strchr(line, ':');
1539 if (!c1)
1540 goto out;
1541 c1++;
1542 c2 = strchr(c1, ':');
1543 if (!c2)
1544 goto out;
1545 *c2 = '\0';
1546 if (strcmp(c1, h) != 0)
1547 continue;
1548 c2++;
1549 stripnewline(c2);
1550 do {
1551 answer = strdup(c2);
1552 } while (!answer);
1553 break;
1554 }
1555
1556out:
1557 fclose(f);
1558 free(line);
1559 return answer;
1560}
1561
1562/*
1563 * check whether a fuse context may access a cgroup dir or file
1564 *
1565 * If file is not null, it is a cgroup file to check under cg.
1566 * If file is null, then we are checking perms on cg itself.
1567 *
1568 * For files we can check the mode of the list_keys result.
1569 * For cgroups, we must make assumptions based on the files under the
1570 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1571 * yet.
1572 */
1573static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1574{
1575 struct cgfs_files *k = NULL;
1576 bool ret = false;
1577
1578 k = cgfs_get_key(contrl, cg, file);
1579 if (!k)
1580 return false;
1581
1582 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1583 if (perms_include(k->mode >> 6, mode)) {
1584 ret = true;
1585 goto out;
1586 }
1587 }
1588 if (fc->gid == k->gid) {
1589 if (perms_include(k->mode >> 3, mode)) {
1590 ret = true;
1591 goto out;
1592 }
1593 }
1594 ret = perms_include(k->mode, mode);
1595
1596out:
1597 free_key(k);
1598 return ret;
1599}
1600
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg, in place.  When the whole string
 * is "/init.scope" the result is "/" rather than the empty string.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0';
	else
		tail[0] = '\0';
}
1618
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * The test is a prefix comparison: the task's cgroup (with a trailing
 * "/init.scope" removed) must be a prefix of @cg.  Callers pass "/" or "./"
 * for the root cgroup and otherwise a cgroup without leading '/', so the
 * task's leading '/' is skipped unless @cg starts with '/' or "./".
 *
 * If the answer is false and @nextcg is not NULL, *nextcg receives the
 * result of get_next_cgroup_dir() (the next directory towards the task's
 * cgroup, or NULL), which the caller must free.  Returns false as well when
 * the task's cgroup cannot be determined; *nextcg is left untouched then.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool inside;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = taskcg;
	else
		cmp = taskcg + 1;

	inside = (strncmp(cmp, cg, strlen(cmp)) == 0);
	if (!inside && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return inside;
}
1661
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * A directory is visible when it is the root ("/" or "./"), the task's own
 * cgroup, an ancestor of it, or a descendant of it.  A task in the root
 * cgroup sees everything.  Returns false when the task's cgroup cannot be
 * determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *fullcg;
	const char *task_cg;
	size_t target_len, task_len;
	bool visible = false;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	fullcg = get_pid_cgroup(pid, contrl);
	if (!fullcg)
		return false;
	prune_init_slice(fullcg);

	/* Compare without the task cgroup's leading '/'. */
	task_cg = fullcg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Task lives in the root cgroup: nothing is left after
		 * dropping the leading '/', so the prefix tests below would
		 * not apply. */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* cg is a candidate ancestor of the task's cgroup */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* cg is a candidate descendant of the task's cgroup */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(fullcg);
	return visible;
}
1712
1713/*
1714 * given /cgroup/freezer/a/b, return "freezer".
1715 * the returned char* should NOT be freed.
1716 */
1717static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1718{
1719 const char *p1;
1720 char *contr, *slash;
1721
99142521 1722 if (strlen(path) < 9) {
e254948f 1723 errno = EACCES;
237e200e 1724 return NULL;
99142521
CB
1725 }
1726 if (*(path + 7) != '/') {
1727 errno = EINVAL;
237e200e 1728 return NULL;
99142521 1729 }
3adc421c 1730 p1 = path + 8;
237e200e 1731 contr = strdupa(p1);
99142521
CB
1732 if (!contr) {
1733 errno = ENOMEM;
237e200e 1734 return NULL;
99142521 1735 }
237e200e
SH
1736 slash = strstr(contr, "/");
1737 if (slash)
1738 *slash = '\0';
1739
1740 int i;
3adc421c 1741 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1742 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1743 return hierarchies[i];
1744 }
99142521 1745 errno = ENOENT;
237e200e
SH
1746 return NULL;
1747}
1748
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * The returned pointer refers into @path and may include file (key) names.
 *
 * Returns NULL with errno EACCES when @path is shorter than 9 characters,
 * and NULL with errno EINVAL when nothing follows the controller name.
 * On success errno is reset to 0.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* First '/' after the "/cgroup/" prefix ends the controller name. */
	sep = strchr(path + 8, '/');
	if (sep == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return sep + 1;
}
1769
1770/*
1771 * split the last path element from the path in @cg.
1772 * @dir is newly allocated and should be freed, @last not
1773*/
1774static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1775{
1776 char *p;
1777
1778 do {
1779 *dir = strdup(cg);
1780 } while (!*dir);
1781 *last = strrchr(cg, '/');
1782 if (!*last) {
1783 *last = NULL;
1784 return;
1785 }
1786 p = strrchr(*dir, '/');
1787 *p = '\0';
1788}
1789
1790/*
1791 * FUSE ops for /cgroup
1792 */
1793
1794int cg_getattr(const char *path, struct stat *sb)
1795{
1796 struct timespec now;
1797 struct fuse_context *fc = fuse_get_context();
1798 char * cgdir = NULL;
1799 char *last = NULL, *path1, *path2;
1800 struct cgfs_files *k = NULL;
1801 const char *cgroup;
1802 const char *controller = NULL;
1803 int ret = -ENOENT;
1804
1805
1806 if (!fc)
1807 return -EIO;
1808
1809 memset(sb, 0, sizeof(struct stat));
1810
1811 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1812 return -EINVAL;
1813
1814 sb->st_uid = sb->st_gid = 0;
1815 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1816 sb->st_size = 0;
1817
1818 if (strcmp(path, "/cgroup") == 0) {
1819 sb->st_mode = S_IFDIR | 00755;
1820 sb->st_nlink = 2;
1821 return 0;
1822 }
1823
1824 controller = pick_controller_from_path(fc, path);
1825 if (!controller)
2f7036d0 1826 return -errno;
237e200e
SH
1827 cgroup = find_cgroup_in_path(path);
1828 if (!cgroup) {
1829 /* this is just /cgroup/controller, return it as a dir */
1830 sb->st_mode = S_IFDIR | 00755;
1831 sb->st_nlink = 2;
1832 return 0;
1833 }
1834
1835 get_cgdir_and_path(cgroup, &cgdir, &last);
1836
1837 if (!last) {
1838 path1 = "/";
1839 path2 = cgdir;
1840 } else {
1841 path1 = cgdir;
1842 path2 = last;
1843 }
1844
1845 pid_t initpid = lookup_initpid_in_store(fc->pid);
1846 if (initpid <= 0)
1847 initpid = fc->pid;
1848 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1849 * Then check that caller's cgroup is under path if last is a child
1850 * cgroup, or cgdir if last is a file */
1851
1852 if (is_child_cgroup(controller, path1, path2)) {
1853 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1854 ret = -ENOENT;
1855 goto out;
1856 }
1857 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1858 /* this is just /cgroup/controller, return it as a dir */
1859 sb->st_mode = S_IFDIR | 00555;
1860 sb->st_nlink = 2;
1861 ret = 0;
1862 goto out;
1863 }
1864 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1865 ret = -EACCES;
1866 goto out;
1867 }
1868
1869 // get uid, gid, from '/tasks' file and make up a mode
1870 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1871 sb->st_mode = S_IFDIR | 00755;
1872 k = cgfs_get_key(controller, cgroup, NULL);
1873 if (!k) {
1874 sb->st_uid = sb->st_gid = 0;
1875 } else {
1876 sb->st_uid = k->uid;
1877 sb->st_gid = k->gid;
1878 }
1879 free_key(k);
1880 sb->st_nlink = 2;
1881 ret = 0;
1882 goto out;
1883 }
1884
1885 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1886 sb->st_mode = S_IFREG | k->mode;
1887 sb->st_nlink = 1;
1888 sb->st_uid = k->uid;
1889 sb->st_gid = k->gid;
1890 sb->st_size = 0;
1891 free_key(k);
1892 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1893 ret = -ENOENT;
1894 goto out;
1895 }
237e200e
SH
1896 ret = 0;
1897 }
1898
1899out:
1900 free(cgdir);
1901 return ret;
1902}
1903
1904int cg_opendir(const char *path, struct fuse_file_info *fi)
1905{
1906 struct fuse_context *fc = fuse_get_context();
1907 const char *cgroup;
1908 struct file_info *dir_info;
1909 char *controller = NULL;
1910
1911 if (!fc)
1912 return -EIO;
1913
1914 if (strcmp(path, "/cgroup") == 0) {
1915 cgroup = NULL;
1916 controller = NULL;
1917 } else {
1918 // return list of keys for the controller, and list of child cgroups
1919 controller = pick_controller_from_path(fc, path);
1920 if (!controller)
2f7036d0 1921 return -errno;
237e200e
SH
1922
1923 cgroup = find_cgroup_in_path(path);
1924 if (!cgroup) {
1925 /* this is just /cgroup/controller, return its contents */
1926 cgroup = "/";
1927 }
1928 }
1929
1930 pid_t initpid = lookup_initpid_in_store(fc->pid);
1931 if (initpid <= 0)
1932 initpid = fc->pid;
1933 if (cgroup) {
1934 if (!caller_may_see_dir(initpid, controller, cgroup))
1935 return -ENOENT;
1936 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1937 return -EACCES;
1938 }
1939
1940 /* we'll free this at cg_releasedir */
1941 dir_info = malloc(sizeof(*dir_info));
1942 if (!dir_info)
1943 return -ENOMEM;
1944 dir_info->controller = must_copy_string(controller);
1945 dir_info->cgroup = must_copy_string(cgroup);
1946 dir_info->type = LXC_TYPE_CGDIR;
1947 dir_info->buf = NULL;
1948 dir_info->file = NULL;
1949 dir_info->buflen = 0;
1950
1951 fi->fh = (unsigned long)dir_info;
1952 return 0;
1953}
1954
1955int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1956 struct fuse_file_info *fi)
1957{
1958 struct file_info *d = (struct file_info *)fi->fh;
1959 struct cgfs_files **list = NULL;
1960 int i, ret;
1961 char *nextcg = NULL;
1962 struct fuse_context *fc = fuse_get_context();
1963 char **clist = NULL;
1964
d639f863
CB
1965 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1966 return -EIO;
1967
237e200e 1968 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 1969 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
1970 return -EIO;
1971 }
1972 if (!d->cgroup && !d->controller) {
1973 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1974 int i;
1975
1976 for (i = 0; i < num_hierarchies; i++) {
1977 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1978 return -EIO;
1979 }
1980 }
1981 return 0;
1982 }
1983
1984 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1985 // not a valid cgroup
1986 ret = -EINVAL;
1987 goto out;
1988 }
1989
1990 pid_t initpid = lookup_initpid_in_store(fc->pid);
1991 if (initpid <= 0)
1992 initpid = fc->pid;
1993 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1994 if (nextcg) {
1995 ret = filler(buf, nextcg, NULL, 0);
1996 free(nextcg);
1997 if (ret != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002 ret = 0;
2003 goto out;
2004 }
2005
2006 for (i = 0; list[i]; i++) {
2007 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012
2013 // now get the list of child cgroups
2014
2015 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2016 ret = 0;
2017 goto out;
2018 }
f366da65
WB
2019 if (clist) {
2020 for (i = 0; clist[i]; i++) {
2021 if (filler(buf, clist[i], NULL, 0) != 0) {
2022 ret = -EIO;
2023 goto out;
2024 }
237e200e
SH
2025 }
2026 }
2027 ret = 0;
2028
2029out:
2030 free_keys(list);
2031 if (clist) {
2032 for (i = 0; clist[i]; i++)
2033 free(clist[i]);
2034 free(clist);
2035 }
2036 return ret;
2037}
2038
43215927 2039static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2040{
43215927
SH
2041 struct file_info *f = (struct file_info *)fi->fh;
2042
237e200e
SH
2043 if (!f)
2044 return;
43215927
SH
2045
2046 fi->fh = 0;
2047
237e200e 2048 free(f->controller);
43215927 2049 f->controller = NULL;
237e200e 2050 free(f->cgroup);
43215927 2051 f->cgroup = NULL;
237e200e 2052 free(f->file);
43215927 2053 f->file = NULL;
237e200e 2054 free(f->buf);
43215927 2055 f->buf = NULL;
237e200e 2056 free(f);
bbb508dd 2057 f = NULL;
237e200e
SH
2058}
2059
/*
 * FUSE releasedir: free the directory handle stored in fi->fh.
 * @path is unused.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
2065
2066int cg_open(const char *path, struct fuse_file_info *fi)
2067{
2068 const char *cgroup;
2069 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2070 struct cgfs_files *k = NULL;
2071 struct file_info *file_info;
2072 struct fuse_context *fc = fuse_get_context();
2073 int ret;
2074
2075 if (!fc)
2076 return -EIO;
2077
2078 controller = pick_controller_from_path(fc, path);
2079 if (!controller)
2f7036d0 2080 return -errno;
237e200e
SH
2081 cgroup = find_cgroup_in_path(path);
2082 if (!cgroup)
bc70ba9b 2083 return -errno;
237e200e
SH
2084
2085 get_cgdir_and_path(cgroup, &cgdir, &last);
2086 if (!last) {
2087 path1 = "/";
2088 path2 = cgdir;
2089 } else {
2090 path1 = cgdir;
2091 path2 = last;
2092 }
2093
2094 k = cgfs_get_key(controller, path1, path2);
2095 if (!k) {
2096 ret = -EINVAL;
2097 goto out;
2098 }
2099 free_key(k);
2100
2101 pid_t initpid = lookup_initpid_in_store(fc->pid);
2102 if (initpid <= 0)
2103 initpid = fc->pid;
2104 if (!caller_may_see_dir(initpid, controller, path1)) {
2105 ret = -ENOENT;
2106 goto out;
2107 }
2108 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2109 ret = -EACCES;
2110 goto out;
2111 }
2112
2113 /* we'll free this at cg_release */
2114 file_info = malloc(sizeof(*file_info));
2115 if (!file_info) {
2116 ret = -ENOMEM;
2117 goto out;
2118 }
2119 file_info->controller = must_copy_string(controller);
2120 file_info->cgroup = must_copy_string(path1);
2121 file_info->file = must_copy_string(path2);
2122 file_info->type = LXC_TYPE_CGFILE;
2123 file_info->buf = NULL;
2124 file_info->buflen = 0;
2125
2126 fi->fh = (unsigned long)file_info;
2127 ret = 0;
2128
2129out:
2130 free(cgdir);
2131 return ret;
2132}
2133
bddbb106
SH
2134int cg_access(const char *path, int mode)
2135{
6f0f6b83 2136 int ret;
bddbb106 2137 const char *cgroup;
6f0f6b83
CB
2138 char *path1, *path2, *controller;
2139 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2140 struct cgfs_files *k = NULL;
2141 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2142
9873c5e8 2143 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2144 return 0;
bddbb106
SH
2145
2146 if (!fc)
2147 return -EIO;
2148
2149 controller = pick_controller_from_path(fc, path);
2150 if (!controller)
2f7036d0 2151 return -errno;
bddbb106 2152 cgroup = find_cgroup_in_path(path);
575316c4
SH
2153 if (!cgroup) {
2154 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2155 if ((mode & W_OK) == 0)
2156 return 0;
2157 return -EACCES;
575316c4 2158 }
bddbb106
SH
2159
2160 get_cgdir_and_path(cgroup, &cgdir, &last);
2161 if (!last) {
2162 path1 = "/";
2163 path2 = cgdir;
2164 } else {
2165 path1 = cgdir;
2166 path2 = last;
2167 }
2168
2169 k = cgfs_get_key(controller, path1, path2);
2170 if (!k) {
3f441bc7
SH
2171 if ((mode & W_OK) == 0)
2172 ret = 0;
2173 else
2174 ret = -EACCES;
bddbb106
SH
2175 goto out;
2176 }
2177 free_key(k);
2178
2179 pid_t initpid = lookup_initpid_in_store(fc->pid);
2180 if (initpid <= 0)
2181 initpid = fc->pid;
2182 if (!caller_may_see_dir(initpid, controller, path1)) {
2183 ret = -ENOENT;
2184 goto out;
2185 }
2186 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2187 ret = -EACCES;
2188 goto out;
2189 }
2190
2191 ret = 0;
2192
2193out:
2194 free(cgdir);
2195 return ret;
2196}
2197
237e200e
SH
/*
 * FUSE release: free the file handle stored in fi->fh.
 * @path is unused.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
2203
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable (or is hung up), for at most roughly
 * @timeout seconds measured with time().
 *
 * Returns true when epoll_wait() reports an event.  Returns false on
 * timeout or on any error; when the deadline has already passed before
 * waiting, errno is set to 0, otherwise errno is the value left by
 * epoll_wait().  Interrupted waits are restarted with the remaining time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime;

	starttime = time(NULL);
	if (starttime < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	now = time(NULL);
	if (now < 0) {
		close(epfd);
		return false;
	}

	/* Seconds left until the deadline. */
	deltatime = (int)((starttime + timeout) - now);
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep the value from epoll_wait(). */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2251
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting for the socket
 * with wait_for_sock() and a timeout argument of 2.
 * Returns -1 when the wait fails, otherwise the result of the non-blocking
 * recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
2258
2259static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2260{
2261 struct msghdr msg = { 0 };
2262 struct iovec iov;
2263 struct cmsghdr *cmsg;
2264 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2265 char buf[1];
2266 buf[0] = 'p';
2267
2268 if (pingfirst) {
2269 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2270 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2271 return SEND_CREDS_FAIL;
2272 }
2273 }
2274
2275 msg.msg_control = cmsgbuf;
2276 msg.msg_controllen = sizeof(cmsgbuf);
2277
2278 cmsg = CMSG_FIRSTHDR(&msg);
2279 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2280 cmsg->cmsg_level = SOL_SOCKET;
2281 cmsg->cmsg_type = SCM_CREDENTIALS;
2282 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2283
2284 msg.msg_name = NULL;
2285 msg.msg_namelen = 0;
2286
2287 buf[0] = v;
2288 iov.iov_base = buf;
2289 iov.iov_len = sizeof(buf);
2290 msg.msg_iov = &iov;
2291 msg.msg_iovlen = 1;
2292
2293 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2294 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2295 if (errno == 3)
2296 return SEND_CREDS_NOTSK;
2297 return SEND_CREDS_FAIL;
2298 }
2299
2300 return SEND_CREDS_OK;
2301}
2302
2303static bool recv_creds(int sock, struct ucred *cred, char *v)
2304{
2305 struct msghdr msg = { 0 };
2306 struct iovec iov;
2307 struct cmsghdr *cmsg;
2308 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2309 char buf[1];
2310 int ret;
2311 int optval = 1;
2312
2313 *v = '1';
2314
2315 cred->pid = -1;
2316 cred->uid = -1;
2317 cred->gid = -1;
2318
2319 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
b8defc3d 2320 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
237e200e
SH
2321 return false;
2322 }
2323 buf[0] = '1';
2324 if (write(sock, buf, 1) != 1) {
b8defc3d 2325 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
237e200e
SH
2326 return false;
2327 }
2328
2329 msg.msg_name = NULL;
2330 msg.msg_namelen = 0;
2331 msg.msg_control = cmsgbuf;
2332 msg.msg_controllen = sizeof(cmsgbuf);
2333
2334 iov.iov_base = buf;
2335 iov.iov_len = sizeof(buf);
2336 msg.msg_iov = &iov;
2337 msg.msg_iovlen = 1;
2338
2339 if (!wait_for_sock(sock, 2)) {
b8defc3d 2340 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
237e200e
SH
2341 return false;
2342 }
2343 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2344 if (ret < 0) {
b8defc3d 2345 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
237e200e
SH
2346 return false;
2347 }
2348
2349 cmsg = CMSG_FIRSTHDR(&msg);
2350
2351 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2352 cmsg->cmsg_level == SOL_SOCKET &&
2353 cmsg->cmsg_type == SCM_CREDENTIALS) {
2354 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2355 }
2356 *v = buf[0];
2357
2358 return true;
2359}
2360
35174b0f
FG
/*
 * Argument bundle handed to pid_ns_clone_wrapper() through clone().
 */
struct pid_ns_clone_args {
	int *cpipe;                  /* pipe fd pair: wrapper closes [0], ACKs on [1] */
	int sock;                    /* first argument forwarded to @wrapped */
	pid_t tpid;                  /* second argument forwarded to @wrapped */
	int (*wrapped)(int, pid_t);  /* pid_from_ns or pid_to_ns */
};
2367
2368/*
2369 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2370 * with clone(). This simply writes '1' as ACK back to the parent
2371 * before calling the actual wrapped function.
2372 */
2373static int pid_ns_clone_wrapper(void *arg) {
2374 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2375 char b = '1';
2376
2377 close(args->cpipe[0]);
b8defc3d
CB
2378 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2379 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2380 close(args->cpipe[1]);
2381 return args->wrapped(args->sock, args->tpid);
2382}
237e200e
SH
2383
2384/*
2385 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2386 * int value back over the socket. This shifts the pid from the
2387 * sender's pidns into tpid's pidns.
2388 */
35174b0f 2389static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2390{
2391 char v = '0';
2392 struct ucred cred;
2393
2394 while (recv_creds(sock, &cred, &v)) {
2395 if (v == '1')
35174b0f 2396 return 0;
237e200e 2397 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2398 return 1;
237e200e 2399 }
35174b0f 2400 return 0;
237e200e
SH
2401}
2402
35174b0f 2403
237e200e
SH
2404/*
2405 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2406 * in your old pidns. Only children which you clone will be in the target
2407 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2408 * actually convert pids.
2409 *
2410 * Note: glibc's fork() does not respect pidns, which can lead to failed
2411 * assertions inside glibc (and thus failed forks) if the child's pid in
2412 * the pidns and the parent pid outside are identical. Using clone prevents
2413 * this issue.
237e200e
SH
2414 */
2415static void pid_to_ns_wrapper(int sock, pid_t tpid)
2416{
2417 int newnsfd = -1, ret, cpipe[2];
2418 char fnam[100];
2419 pid_t cpid;
2420 char v;
2421
2422 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2423 if (ret < 0 || ret >= sizeof(fnam))
2424 _exit(1);
2425 newnsfd = open(fnam, O_RDONLY);
2426 if (newnsfd < 0)
2427 _exit(1);
2428 if (setns(newnsfd, 0) < 0)
2429 _exit(1);
2430 close(newnsfd);
2431
2432 if (pipe(cpipe) < 0)
2433 _exit(1);
2434
35174b0f
FG
2435 struct pid_ns_clone_args args = {
2436 .cpipe = cpipe,
2437 .sock = sock,
2438 .tpid = tpid,
2439 .wrapped = &pid_to_ns
2440 };
2441 size_t stack_size = sysconf(_SC_PAGESIZE);
2442 void *stack = alloca(stack_size);
2443
2444 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2445 if (cpid < 0)
2446 _exit(1);
2447
237e200e
SH
2448 // give the child 1 second to be done forking and
2449 // write its ack
2450 if (!wait_for_sock(cpipe[0], 1))
2451 _exit(1);
2452 ret = read(cpipe[0], &v, 1);
2453 if (ret != sizeof(char) || v != '1')
2454 _exit(1);
2455
2456 if (!wait_for_pid(cpid))
2457 _exit(1);
2458 _exit(0);
2459}
2460
2461/*
2462 * To read cgroup files with a particular pid, we will setns into the child
2463 * pidns, open a pipe, fork a child - which will be the first to really be in
2464 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2465 */
2466bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2467{
2468 int sock[2] = {-1, -1};
2469 char *tmpdata = NULL;
2470 int ret;
2471 pid_t qpid, cpid = -1;
2472 bool answer = false;
2473 char v = '0';
2474 struct ucred cred;
2475 size_t sz = 0, asz = 0;
2476
2477 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2478 return false;
2479
2480 /*
2481 * Now we read the pids from returned data one by one, pass
2482 * them into a child in the target namespace, read back the
2483 * translated pids, and put them into our to-return data
2484 */
2485
2486 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2487 perror("socketpair");
2488 free(tmpdata);
2489 return false;
2490 }
2491
2492 cpid = fork();
2493 if (cpid == -1)
2494 goto out;
2495
2496 if (!cpid) // child - exits when done
2497 pid_to_ns_wrapper(sock[1], tpid);
2498
2499 char *ptr = tmpdata;
2500 cred.uid = 0;
2501 cred.gid = 0;
2502 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2503 cred.pid = qpid;
2504 ret = send_creds(sock[0], &cred, v, true);
2505
2506 if (ret == SEND_CREDS_NOTSK)
2507 goto next;
2508 if (ret == SEND_CREDS_FAIL)
2509 goto out;
2510
2511 // read converted results
2512 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2513 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2514 goto out;
2515 }
2516 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2517 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2518 goto out;
2519 }
2520 must_strcat_pid(d, &sz, &asz, qpid);
2521next:
2522 ptr = strchr(ptr, '\n');
2523 if (!ptr)
2524 break;
2525 ptr++;
2526 }
2527
2528 cred.pid = getpid();
2529 v = '1';
2530 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2531 // failed to ask child to exit
b8defc3d 2532 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2533 goto out;
2534 }
2535
2536 answer = true;
2537
2538out:
2539 free(tmpdata);
2540 if (cpid != -1)
2541 wait_for_pid(cpid);
2542 if (sock[0] != -1) {
2543 close(sock[0]);
2544 close(sock[1]);
2545 }
2546 return answer;
2547}
2548
2549int cg_read(const char *path, char *buf, size_t size, off_t offset,
2550 struct fuse_file_info *fi)
2551{
2552 struct fuse_context *fc = fuse_get_context();
2553 struct file_info *f = (struct file_info *)fi->fh;
2554 struct cgfs_files *k = NULL;
2555 char *data = NULL;
2556 int ret, s;
2557 bool r;
2558
2559 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2560 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2561 return -EIO;
2562 }
2563
2564 if (offset)
2565 return 0;
2566
2567 if (!fc)
2568 return -EIO;
2569
2570 if (!f->controller)
2571 return -EINVAL;
2572
2573 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2574 return -EINVAL;
2575 }
2576 free_key(k);
2577
2578
888f8f3c 2579 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2580 ret = -EACCES;
2581 goto out;
2582 }
2583
2584 if (strcmp(f->file, "tasks") == 0 ||
2585 strcmp(f->file, "/tasks") == 0 ||
2586 strcmp(f->file, "/cgroup.procs") == 0 ||
2587 strcmp(f->file, "cgroup.procs") == 0)
2588 // special case - we have to translate the pids
2589 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2590 else
2591 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2592
2593 if (!r) {
2594 ret = -EINVAL;
2595 goto out;
2596 }
2597
2598 if (!data) {
2599 ret = 0;
2600 goto out;
2601 }
2602 s = strlen(data);
2603 if (s > size)
2604 s = size;
2605 memcpy(buf, data, s);
2606 if (s > 0 && s < size && data[s-1] != '\n')
2607 buf[s++] = '\n';
2608
2609 ret = s;
2610
2611out:
2612 free(data);
2613 return ret;
2614}
2615
35174b0f 2616static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2617{
2618 pid_t vpid;
2619 struct ucred cred;
2620 char v;
2621 int ret;
2622
2623 cred.uid = 0;
2624 cred.gid = 0;
2625 while (1) {
2626 if (!wait_for_sock(sock, 2)) {
b8defc3d 2627 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2628 return 1;
237e200e
SH
2629 }
2630 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2631 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2632 return 1;
237e200e
SH
2633 }
2634 if (vpid == -1) // done
2635 break;
2636 v = '0';
2637 cred.pid = vpid;
2638 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2639 v = '1';
2640 cred.pid = getpid();
2641 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2642 return 1;
237e200e
SH
2643 }
2644 }
35174b0f 2645 return 0;
237e200e
SH
2646}
2647
2648static void pid_from_ns_wrapper(int sock, pid_t tpid)
2649{
2650 int newnsfd = -1, ret, cpipe[2];
2651 char fnam[100];
2652 pid_t cpid;
2653 char v;
2654
2655 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2656 if (ret < 0 || ret >= sizeof(fnam))
2657 _exit(1);
2658 newnsfd = open(fnam, O_RDONLY);
2659 if (newnsfd < 0)
2660 _exit(1);
2661 if (setns(newnsfd, 0) < 0)
2662 _exit(1);
2663 close(newnsfd);
2664
2665 if (pipe(cpipe) < 0)
2666 _exit(1);
2667
35174b0f
FG
2668 struct pid_ns_clone_args args = {
2669 .cpipe = cpipe,
2670 .sock = sock,
2671 .tpid = tpid,
2672 .wrapped = &pid_from_ns
2673 };
f0f8b851
SH
2674 size_t stack_size = sysconf(_SC_PAGESIZE);
2675 void *stack = alloca(stack_size);
35174b0f
FG
2676
2677 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2678 if (cpid < 0)
2679 _exit(1);
2680
237e200e
SH
2681 // give the child 1 second to be done forking and
2682 // write its ack
2683 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2684 _exit(1);
237e200e 2685 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2686 if (ret != sizeof(char) || v != '1')
2687 _exit(1);
237e200e
SH
2688
2689 if (!wait_for_pid(cpid))
2690 _exit(1);
2691 _exit(0);
237e200e
SH
2692}
2693
/*
 * hostuid_to_ns - translate host @uid into the user namespace of @pid.
 *
 * Opens /proc/@pid/uid_map and lets convert_id_to_ns() do the lookup. The
 * result is stored in *@answer.
 *
 * Returns true if a mapping was found. Returns false if the map file
 * cannot be opened (*@answer untouched) or if the conversion yields -1
 * (*@answer is then (uid_t)-1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char path[400];
	FILE *f;
	int len;

	len = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
	if (len < 0 || (size_t)len >= sizeof(path))
		return false;

	f = fopen(path, "r");
	if (f == NULL)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	return *answer != (uid_t)-1;
}
2715
/*
 * get_pid_creds: look up the uid and gid of @pid by parsing the first
 * number on the "Uid:" and "Gid:" lines of /proc/@pid/status.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and keep that value for whichever of
 * them could not be determined (file unreadable, line missing, or parsing
 * stopped early on a malformed line).
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	/* line[] doubles as the path buffer before it is used for reading. */
	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (status == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2754
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* The requestor has to be uid 0 inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid has to be mapped there as well. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2780
2781static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2782 const char *file, const char *buf)
2783{
2784 int sock[2] = {-1, -1};
2785 pid_t qpid, cpid = -1;
2786 FILE *pids_file = NULL;
2787 bool answer = false, fail = false;
2788
2789 pids_file = open_pids_file(contrl, cg);
2790 if (!pids_file)
2791 return false;
2792
2793 /*
2794 * write the pids to a socket, have helper in writer's pidns
2795 * call movepid for us
2796 */
2797 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2798 perror("socketpair");
2799 goto out;
2800 }
2801
2802 cpid = fork();
2803 if (cpid == -1)
2804 goto out;
2805
2806 if (!cpid) { // child
2807 fclose(pids_file);
2808 pid_from_ns_wrapper(sock[1], tpid);
2809 }
2810
2811 const char *ptr = buf;
2812 while (sscanf(ptr, "%d", &qpid) == 1) {
2813 struct ucred cred;
2814 char v;
2815
2816 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2817 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
237e200e
SH
2818 goto out;
2819 }
2820
2821 if (recv_creds(sock[0], &cred, &v)) {
2822 if (v == '0') {
2823 if (!may_move_pid(tpid, tuid, cred.pid)) {
2824 fail = true;
2825 break;
2826 }
2827 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2828 fail = true;
2829 }
2830 }
2831
2832 ptr = strchr(ptr, '\n');
2833 if (!ptr)
2834 break;
2835 ptr++;
2836 }
2837
2838 /* All good, write the value */
2839 qpid = -1;
2840 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
b8defc3d 2841 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
237e200e
SH
2842
2843 if (!fail)
2844 answer = true;
2845
2846out:
2847 if (cpid != -1)
2848 wait_for_pid(cpid);
2849 if (sock[0] != -1) {
2850 close(sock[0]);
2851 close(sock[1]);
2852 }
2853 if (pids_file) {
2854 if (fclose(pids_file) != 0)
2855 answer = false;
2856 }
2857 return answer;
2858}
2859
2860int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2861 struct fuse_file_info *fi)
2862{
2863 struct fuse_context *fc = fuse_get_context();
2864 char *localbuf = NULL;
2865 struct cgfs_files *k = NULL;
2866 struct file_info *f = (struct file_info *)fi->fh;
2867 bool r;
2868
2869 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2870 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
2871 return -EIO;
2872 }
2873
2874 if (offset)
2875 return 0;
2876
2877 if (!fc)
2878 return -EIO;
2879
2880 localbuf = alloca(size+1);
2881 localbuf[size] = '\0';
2882 memcpy(localbuf, buf, size);
2883
2884 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2885 size = -EINVAL;
2886 goto out;
2887 }
2888
2889 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2890 size = -EACCES;
2891 goto out;
2892 }
2893
2894 if (strcmp(f->file, "tasks") == 0 ||
2895 strcmp(f->file, "/tasks") == 0 ||
2896 strcmp(f->file, "/cgroup.procs") == 0 ||
2897 strcmp(f->file, "cgroup.procs") == 0)
2898 // special case - we have to translate the pids
2899 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2900 else
2901 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2902
2903 if (!r)
2904 size = -EINVAL;
2905
2906out:
2907 free_key(k);
2908 return size;
2909}
2910
2911int cg_chown(const char *path, uid_t uid, gid_t gid)
2912{
2913 struct fuse_context *fc = fuse_get_context();
2914 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2915 struct cgfs_files *k = NULL;
2916 const char *cgroup;
2917 int ret;
2918
2919 if (!fc)
2920 return -EIO;
2921
2922 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2923 return -EPERM;
237e200e
SH
2924
2925 controller = pick_controller_from_path(fc, path);
2926 if (!controller)
bc70ba9b
CB
2927 return errno == ENOENT ? -EPERM : -errno;
2928
237e200e
SH
2929 cgroup = find_cgroup_in_path(path);
2930 if (!cgroup)
2931 /* this is just /cgroup/controller */
bc70ba9b 2932 return -EPERM;
237e200e
SH
2933
2934 get_cgdir_and_path(cgroup, &cgdir, &last);
2935
2936 if (!last) {
2937 path1 = "/";
2938 path2 = cgdir;
2939 } else {
2940 path1 = cgdir;
2941 path2 = last;
2942 }
2943
2944 if (is_child_cgroup(controller, path1, path2)) {
2945 // get uid, gid, from '/tasks' file and make up a mode
2946 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2947 k = cgfs_get_key(controller, cgroup, "tasks");
2948
2949 } else
2950 k = cgfs_get_key(controller, path1, path2);
2951
2952 if (!k) {
2953 ret = -EINVAL;
2954 goto out;
2955 }
2956
2957 /*
2958 * This being a fuse request, the uid and gid must be valid
2959 * in the caller's namespace. So we can just check to make
2960 * sure that the caller is root in his uid, and privileged
2961 * over the file's current owner.
2962 */
2963 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2964 ret = -EACCES;
2965 goto out;
2966 }
2967
2968 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2969
2970out:
2971 free_key(k);
2972 free(cgdir);
2973
2974 return ret;
2975}
2976
2977int cg_chmod(const char *path, mode_t mode)
2978{
2979 struct fuse_context *fc = fuse_get_context();
2980 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2981 struct cgfs_files *k = NULL;
2982 const char *cgroup;
2983 int ret;
2984
2985 if (!fc)
2986 return -EIO;
2987
2988 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2989 return -EPERM;
237e200e
SH
2990
2991 controller = pick_controller_from_path(fc, path);
2992 if (!controller)
bc70ba9b
CB
2993 return errno == ENOENT ? -EPERM : -errno;
2994
237e200e
SH
2995 cgroup = find_cgroup_in_path(path);
2996 if (!cgroup)
2997 /* this is just /cgroup/controller */
bc70ba9b 2998 return -EPERM;
237e200e
SH
2999
3000 get_cgdir_and_path(cgroup, &cgdir, &last);
3001
3002 if (!last) {
3003 path1 = "/";
3004 path2 = cgdir;
3005 } else {
3006 path1 = cgdir;
3007 path2 = last;
3008 }
3009
3010 if (is_child_cgroup(controller, path1, path2)) {
3011 // get uid, gid, from '/tasks' file and make up a mode
3012 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3013 k = cgfs_get_key(controller, cgroup, "tasks");
3014
3015 } else
3016 k = cgfs_get_key(controller, path1, path2);
3017
3018 if (!k) {
3019 ret = -EINVAL;
3020 goto out;
3021 }
3022
3023 /*
3024 * This being a fuse request, the uid and gid must be valid
3025 * in the caller's namespace. So we can just check to make
3026 * sure that the caller is root in his uid, and privileged
3027 * over the file's current owner.
3028 */
3029 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3030 ret = -EPERM;
3031 goto out;
3032 }
3033
3034 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3035 ret = -EINVAL;
3036 goto out;
3037 }
3038
3039 ret = 0;
3040out:
3041 free_key(k);
3042 free(cgdir);
3043 return ret;
3044}
3045
3046int cg_mkdir(const char *path, mode_t mode)
3047{
3048 struct fuse_context *fc = fuse_get_context();
3049 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3050 const char *cgroup;
3051 int ret;
3052
3053 if (!fc)
3054 return -EIO;
3055
237e200e
SH
3056 controller = pick_controller_from_path(fc, path);
3057 if (!controller)
2f7036d0 3058 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3059
3060 cgroup = find_cgroup_in_path(path);
3061 if (!cgroup)
bc70ba9b 3062 return -errno;
237e200e
SH
3063
3064 get_cgdir_and_path(cgroup, &cgdir, &last);
3065 if (!last)
3066 path1 = "/";
3067 else
3068 path1 = cgdir;
3069
3070 pid_t initpid = lookup_initpid_in_store(fc->pid);
3071 if (initpid <= 0)
3072 initpid = fc->pid;
3073 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3074 if (!next)
3075 ret = -EINVAL;
3076 else if (last && strcmp(next, last) == 0)
3077 ret = -EEXIST;
3078 else
2f7036d0 3079 ret = -EPERM;
237e200e
SH
3080 goto out;
3081 }
3082
3083 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3084 ret = -EACCES;
3085 goto out;
3086 }
3087 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3088 ret = -EACCES;
3089 goto out;
3090 }
3091
3092 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3093
3094out:
3095 free(cgdir);
3096 free(next);
3097 return ret;
3098}
3099
3100int cg_rmdir(const char *path)
3101{
3102 struct fuse_context *fc = fuse_get_context();
3103 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3104 const char *cgroup;
3105 int ret;
3106
3107 if (!fc)
3108 return -EIO;
3109
3110 controller = pick_controller_from_path(fc, path);
e254948f
CB
3111 if (!controller) /* Someone's trying to delete "/cgroup". */
3112 return -EPERM;
237e200e
SH
3113
3114 cgroup = find_cgroup_in_path(path);
e254948f
CB
3115 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3116 return -EPERM;
237e200e
SH
3117
3118 get_cgdir_and_path(cgroup, &cgdir, &last);
3119 if (!last) {
e254948f
CB
3120 /* Someone's trying to delete a cgroup on the same level as the
3121 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3122 * rmdir "/cgroup/blkio/init.slice".
3123 */
3124 ret = -EPERM;
237e200e
SH
3125 goto out;
3126 }
3127
3128 pid_t initpid = lookup_initpid_in_store(fc->pid);
3129 if (initpid <= 0)
3130 initpid = fc->pid;
3131 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3132 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3133 ret = -EBUSY;
3134 else
3135 ret = -ENOENT;
3136 goto out;
3137 }
3138
3139 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3140 ret = -EACCES;
3141 goto out;
3142 }
3143 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3144 ret = -EACCES;
3145 goto out;
3146 }
3147
3148 if (!cgfs_remove(controller, cgroup)) {
3149 ret = -EINVAL;
3150 goto out;
3151 }
3152
3153 ret = 0;
3154
3155out:
3156 free(cgdir);
3157 free(next);
3158 return ret;
3159}
3160
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
3167
c6095b08
SH
/*
 * parse_memstat - extract selected "total_*" counters from the text in
 * @memstat (one "key value" pair per line).
 *
 * For each line whose key is one of total_cache, total_active_anon,
 * total_inactive_anon, total_active_file, total_inactive_file,
 * total_unevictable or total_shmem, the numeric value is divided by 1024
 * and stored through the matching output pointer. Outputs whose key is
 * absent, or whose value cannot be parsed as a number, are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
		{ "total_shmem",         shmem },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *eol;

	while (*memstat) {
		for (size_t i = 0; i < nfields; i++) {
			size_t keylen = strlen(fields[i].key);
			unsigned long value;

			if (strncmp(memstat, fields[i].key, keylen) != 0)
				continue;

			/* Only publish a value that was actually parsed. */
			if (sscanf(memstat + keylen, "%lu", &value) == 1)
				*fields[i].dest = value / 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3204
/*
 * get_blkio_io_value - find the line in @str that starts with
 * "<major>:<minor> <iotype>" and parse the unsigned number following it
 * into *@v.
 *
 * *@v is set to 0 first and stays 0 if no line matches or the number
 * cannot be parsed. Only the first matching line is considered.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *line, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (line = str; *line; line = nl + 1) {
		if (strncmp(line, key, keylen) == 0) {
			sscanf(line + keylen, "%lu", v);
			return;
		}

		nl = strchr(line, '\n');
		if (nl == NULL)
			return;
	}
}
3227
3228static int read_file(const char *path, char *buf, size_t size,
3229 struct file_info *d)
3230{
3231 size_t linelen = 0, total_len = 0, rv = 0;
3232 char *line = NULL;
3233 char *cache = d->buf;
3234 size_t cache_size = d->buflen;
3235 FILE *f = fopen(path, "r");
3236 if (!f)
3237 return 0;
3238
3239 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3240 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3241 if (l < 0) {
3242 perror("Error writing to cache");
3243 rv = 0;
3244 goto err;
3245 }
3246 if (l >= cache_size) {
b8defc3d 3247 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3248 rv = 0;
3249 goto err;
3250 }
3251 cache += l;
3252 cache_size -= l;
3253 total_len += l;
3254 }
3255
3256 d->size = total_len;
a262ddb7
CB
3257 if (total_len > size)
3258 total_len = size;
237e200e
SH
3259
3260 /* read from off 0 */
3261 memcpy(buf, d->buf, total_len);
3262 rv = total_len;
3263 err:
3264 fclose(f);
3265 free(line);
3266 return rv;
3267}
3268
3269/*
3270 * FUSE ops for /proc
3271 */
3272
/*
 * get_memlimit - read @file of the "memory" controller for @cgroup and
 * parse it as a base-10 unsigned long.
 *
 * Returns the parsed value, or (unsigned long)-1 if the file could not be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
3285
/*
 * get_min_memlimit - smallest value of @file found for @cgroup and all of
 * its ancestors up to "/".
 *
 * Starts with the value for @cgroup itself and walks up with dirname(),
 * taking the minimum over every level whose value could be read
 * (get_memlimit() != -1). If @cgroup's own value is unreadable the start
 * value is (unsigned long)-1, so any readable ancestor value wins.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	/*
	 * dirname() turns a path without any '/' into ".", and "." maps to
	 * itself, so treat it as a terminator too; otherwise a relative
	 * cgroup path would never reach "/" and loop forever.
	 */
	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3302
3303static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3304 struct fuse_file_info *fi)
3305{
3306 struct fuse_context *fc = fuse_get_context();
3307 struct file_info *d = (struct file_info *)fi->fh;
3308 char *cg;
3309 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3310 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3311 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3312 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
559eaa8f 3313 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
594a10e6 3314 hostswtotal = 0;
237e200e
SH
3315 char *line = NULL;
3316 size_t linelen = 0, total_len = 0, rv = 0;
3317 char *cache = d->buf;
3318 size_t cache_size = d->buflen;
3319 FILE *f = NULL;
3320
3321 if (offset){
3322 if (offset > d->size)
3323 return -EINVAL;
3324 if (!d->cached)
3325 return 0;
3326 int left = d->size - offset;
3327 total_len = left > size ? size: left;
3328 memcpy(buf, cache + offset, total_len);
3329 return total_len;
3330 }
3331
3332 pid_t initpid = lookup_initpid_in_store(fc->pid);
3333 if (initpid <= 0)
3334 initpid = fc->pid;
3335 cg = get_pid_cgroup(initpid, "memory");
3336 if (!cg)
3337 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3338 prune_init_slice(cg);
237e200e 3339
018246ff 3340 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3341 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3342 goto err;
3343 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3344 goto err;
3345
3346 // Following values are allowed to fail, because swapaccount might be turned
3347 // off for current kernel
3348 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3349 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3350 {
018246ff 3351 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3352 memswusage = strtoul(memswusage_str, NULL, 10);
3353
237e200e
SH
3354 memswlimit = memswlimit / 1024;
3355 memswusage = memswusage / 1024;
3356 }
3357
3358 memusage = strtoul(memusage_str, NULL, 10);
3359 memlimit /= 1024;
3360 memusage /= 1024;
3361
c6095b08
SH
3362 parse_memstat(memstat_str, &cached, &active_anon,
3363 &inactive_anon, &active_file, &inactive_file,
559eaa8f 3364 &unevictable, &shmem);
237e200e
SH
3365
3366 f = fopen("/proc/meminfo", "r");
3367 if (!f)
3368 goto err;
3369
3370 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3371 ssize_t l;
237e200e
SH
3372 char *printme, lbuf[100];
3373
3374 memset(lbuf, 0, 100);
3375 if (startswith(line, "MemTotal:")) {
594a10e6 3376 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3377 if (hosttotal < memlimit)
3378 memlimit = hosttotal;
3379 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3380 printme = lbuf;
3381 } else if (startswith(line, "MemFree:")) {
3382 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3383 printme = lbuf;
3384 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3385 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3386 printme = lbuf;
3387 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3388 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3389 if (hostswtotal < memswlimit)
3390 memswlimit = hostswtotal;
3391 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3392 printme = lbuf;
3393 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3394 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3395 swapusage = memswusage - memusage,
3396 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3397 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3398 printme = lbuf;
da35d72a
SH
3399 } else if (startswith(line, "Slab:")) {
3400 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3401 printme = lbuf;
237e200e
SH
3402 } else if (startswith(line, "Buffers:")) {
3403 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3404 printme = lbuf;
3405 } else if (startswith(line, "Cached:")) {
3406 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3407 printme = lbuf;
3408 } else if (startswith(line, "SwapCached:")) {
3409 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3410 printme = lbuf;
2f306ad3 3411 } else if (startswith(line, "Active:")) {
c6095b08
SH
3412 snprintf(lbuf, 100, "Active: %8lu kB\n",
3413 active_anon + active_file);
3414 printme = lbuf;
2f306ad3 3415 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3416 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3417 inactive_anon + inactive_file);
3418 printme = lbuf;
3419 } else if (startswith(line, "Active(anon)")) {
3420 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3421 printme = lbuf;
3422 } else if (startswith(line, "Inactive(anon)")) {
3423 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3424 printme = lbuf;
3425 } else if (startswith(line, "Active(file)")) {
3426 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3427 printme = lbuf;
3428 } else if (startswith(line, "Inactive(file)")) {
3429 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3430 printme = lbuf;
3431 } else if (startswith(line, "Unevictable")) {
3432 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3433 printme = lbuf;
3434 } else if (startswith(line, "SReclaimable")) {
3435 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3436 printme = lbuf;
3437 } else if (startswith(line, "SUnreclaim")) {
3438 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3439 printme = lbuf;
559eaa8f
JS
3440 } else if (startswith(line, "Shmem:")) {
3441 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3442 printme = lbuf;
28cdea9b
JS
3443 } else if (startswith(line, "ShmemHugePages")) {
3444 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3445 printme = lbuf;
3446 } else if (startswith(line, "ShmemPmdMapped")) {
3447 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3448 printme = lbuf;
237e200e
SH
3449 } else
3450 printme = line;
3451
3452 l = snprintf(cache, cache_size, "%s", printme);
3453 if (l < 0) {
3454 perror("Error writing to cache");
3455 rv = 0;
3456 goto err;
3457
3458 }
3459 if (l >= cache_size) {
b8defc3d 3460 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3461 rv = 0;
3462 goto err;
3463 }
3464
3465 cache += l;
3466 cache_size -= l;
3467 total_len += l;
3468 }
3469
3470 d->cached = 1;
3471 d->size = total_len;
3472 if (total_len > size ) total_len = size;
3473 memcpy(buf, d->buf, total_len);
3474
3475 rv = total_len;
3476err:
3477 if (f)
3478 fclose(f);
3479 free(line);
3480 free(cg);
3481 free(memusage_str);
3482 free(memswlimit_str);
3483 free(memswusage_str);
3484 free(memstat_str);
237e200e
SH
3485 return rv;
3486}
3487
/*
 * Fetch the contents of cpuset.cpus for cgroup @cg.
 * On success the returned string is newly allocated and must be freed by
 * the caller; NULL is returned when the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3500
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * that cpu is part of @cpuset. A line that does not parse yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_nr;

	return sscanf(line, "processor : %d", &cpu_nr) == 1 &&
	       cpu_in_cpuset(cpu_nr, cpuset);
}
3511
c59d6a55
JS
/*
 * Read a cgroup CPU quota parameter from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param` ("quota" or "period").
 * The parsed value is returned through `value`.
 *
 * Returns true on success. Returns false when the file name built from
 * `param` does not fit the local buffer, when the cgroup file cannot be read,
 * or when its contents do not parse as an integer.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int len;

	/* Bounded write: an unexpected `param` must never overflow `file`. */
	len = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (len < 0 || (size_t)len >= sizeof(file))
		goto err;

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64 matches int64_t on every ABI; "%ld" only does so on LP64. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str);
	return rv;
}
3537
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, or the quota parameters cannot be read, zero is
 * returned. The result is never larger than get_nprocs().
 */
int max_cpu_count(const char *cg)
{
	int nprocs;
	int64_t cfs_quota, cfs_period, count;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	/* Keep the quotient 64 bit wide until it has been clamped; narrowing
	 * it to int first could truncate a large quota to 0 or a negative
	 * count.
	 */
	count = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		count += 1;

	nprocs = get_nprocs();

	if (count > nprocs)
		count = nprocs;

	return (int)count;
}
3571
/*
 * Decide whether the CPU view should be used: true only when both the "cpu"
 * and the "cpuacct" controllers are found mounted. `cg` is not consulted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	return find_mounted_controller("cpu", &cfd) &&
	       find_mounted_controller("cpuacct", &cfd);
}
3590
237e200e
SH
/*
 * Tell whether @line is a "processor : N" line as found in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3602
3603static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3604 struct fuse_file_info *fi)
3605{
3606 struct fuse_context *fc = fuse_get_context();
3607 struct file_info *d = (struct file_info *)fi->fh;
3608 char *cg;
3609 char *cpuset = NULL;
3610 char *line = NULL;
3611 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79 3612 bool am_printing = false, firstline = true, is_s390x = false;
c59d6a55
JS
3613 int curcpu = -1, cpu, max_cpus = 0;
3614 bool use_view;
237e200e
SH
3615 char *cache = d->buf;
3616 size_t cache_size = d->buflen;
3617 FILE *f = NULL;
3618
3619 if (offset){
3620 if (offset > d->size)
3621 return -EINVAL;
3622 if (!d->cached)
3623 return 0;
3624 int left = d->size - offset;
3625 total_len = left > size ? size: left;
3626 memcpy(buf, cache + offset, total_len);
3627 return total_len;
3628 }
3629
3630 pid_t initpid = lookup_initpid_in_store(fc->pid);
3631 if (initpid <= 0)
3632 initpid = fc->pid;
3633 cg = get_pid_cgroup(initpid, "cpuset");
3634 if (!cg)
3635 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3636 prune_init_slice(cg);
237e200e
SH
3637
3638 cpuset = get_cpuset(cg);
3639 if (!cpuset)
3640 goto err;
3641
c59d6a55
JS
3642 use_view = use_cpuview(cg);
3643
3644 if (use_view)
3645 max_cpus = max_cpu_count(cg);
3646
237e200e
SH
3647 f = fopen("/proc/cpuinfo", "r");
3648 if (!f)
3649 goto err;
3650
3651 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3652 ssize_t l;
f676eb79
SH
3653 if (firstline) {
3654 firstline = false;
3655 if (strstr(line, "IBM/S390") != NULL) {
3656 is_s390x = true;
3657 am_printing = true;
5ed9d4e2 3658 continue;
f676eb79
SH
3659 }
3660 }
5ed9d4e2
SH
3661 if (strncmp(line, "# processors:", 12) == 0)
3662 continue;
237e200e 3663 if (is_processor_line(line)) {
c59d6a55
JS
3664 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3665 break;
237e200e
SH
3666 am_printing = cpuline_in_cpuset(line, cpuset);
3667 if (am_printing) {
3668 curcpu ++;
3669 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3670 if (l < 0) {
3671 perror("Error writing to cache");
3672 rv = 0;
3673 goto err;
3674 }
3675 if (l >= cache_size) {
b8defc3d 3676 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3677 rv = 0;
3678 goto err;
3679 }
3680 cache += l;
3681 cache_size -= l;
3682 total_len += l;
3683 }
3684 continue;
f676eb79
SH
3685 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3686 char *p;
c59d6a55
JS
3687 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3688 break;
f676eb79
SH
3689 if (!cpu_in_cpuset(cpu, cpuset))
3690 continue;
3691 curcpu ++;
3692 p = strchr(line, ':');
3693 if (!p || !*p)
3694 goto err;
3695 p++;
5ed9d4e2 3696 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3697 if (l < 0) {
3698 perror("Error writing to cache");
3699 rv = 0;
3700 goto err;
3701 }
3702 if (l >= cache_size) {
b8defc3d 3703 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3704 rv = 0;
3705 goto err;
3706 }
3707 cache += l;
3708 cache_size -= l;
3709 total_len += l;
3710 continue;
3711
237e200e
SH
3712 }
3713 if (am_printing) {
3714 l = snprintf(cache, cache_size, "%s", line);
3715 if (l < 0) {
3716 perror("Error writing to cache");
3717 rv = 0;
3718 goto err;
3719 }
3720 if (l >= cache_size) {
b8defc3d 3721 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3722 rv = 0;
3723 goto err;
3724 }
3725 cache += l;
3726 cache_size -= l;
3727 total_len += l;
3728 }
3729 }
3730
5ed9d4e2
SH
3731 if (is_s390x) {
3732 char *origcache = d->buf;
a262ddb7 3733 ssize_t l;
5ed9d4e2
SH
3734 do {
3735 d->buf = malloc(d->buflen);
3736 } while (!d->buf);
3737 cache = d->buf;
3738 cache_size = d->buflen;
3739 total_len = 0;
3740 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3741 if (l < 0 || l >= cache_size) {
3742 free(origcache);
3743 goto err;
3744 }
3745 cache_size -= l;
3746 cache += l;
3747 total_len += l;
3748 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3749 if (l < 0 || l >= cache_size) {
3750 free(origcache);
3751 goto err;
3752 }
3753 cache_size -= l;
3754 cache += l;
3755 total_len += l;
3756 l = snprintf(cache, cache_size, "%s", origcache);
3757 free(origcache);
3758 if (l < 0 || l >= cache_size)
3759 goto err;
3760 total_len += l;
3761 }
3762
237e200e
SH
3763 d->cached = 1;
3764 d->size = total_len;
3765 if (total_len > size ) total_len = size;
3766
3767 /* read from off 0 */
3768 memcpy(buf, d->buf, total_len);
3769 rv = total_len;
3770err:
3771 if (f)
3772 fclose(f);
3773 free(line);
3774 free(cpuset);
3775 free(cg);
3776 return rv;
3777}
3778
0ecddf02 3779static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3780{
9ac264cf 3781 int ret;
0ecddf02
CB
3782 FILE *f;
3783 uint64_t starttime;
3784 /* strlen("/proc/") = 6
3785 * +
3786 * LXCFS_NUMSTRLEN64
3787 * +
3788 * strlen("/stat") = 5
3789 * +
3790 * \0 = 1
3791 * */
3792#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3793 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3794 pid_t qpid;
3795
3796 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3797 if (qpid <= 0) {
3798 /* Caller can check for EINVAL on 0. */
3799 errno = EINVAL;
9ac264cf 3800 return 0;
0ecddf02 3801 }
9ac264cf 3802
0ecddf02
CB
3803 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3804 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3805 /* Caller can check for EINVAL on 0. */
3806 errno = EINVAL;
9ac264cf 3807 return 0;
0ecddf02 3808 }
9ac264cf 3809
0ecddf02
CB
3810 f = fopen(path, "r");
3811 if (!f) {
3812 /* Caller can check for EINVAL on 0. */
3813 errno = EINVAL;
9ac264cf 3814 return 0;
0ecddf02 3815 }
9ac264cf 3816
0ecddf02
CB
3817 /* Note that the *scanf() argument supression requires that length
3818 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3819 * at us. It's like telling someone you're not married and then asking
3820 * if you can bring your wife to the party.
3821 */
3822 ret = fscanf(f, "%*d " /* (1) pid %d */
3823 "%*s " /* (2) comm %s */
3824 "%*c " /* (3) state %c */
3825 "%*d " /* (4) ppid %d */
3826 "%*d " /* (5) pgrp %d */
3827 "%*d " /* (6) session %d */
3828 "%*d " /* (7) tty_nr %d */
3829 "%*d " /* (8) tpgid %d */
3830 "%*u " /* (9) flags %u */
3831 "%*u " /* (10) minflt %lu */
3832 "%*u " /* (11) cminflt %lu */
3833 "%*u " /* (12) majflt %lu */
3834 "%*u " /* (13) cmajflt %lu */
3835 "%*u " /* (14) utime %lu */
3836 "%*u " /* (15) stime %lu */
3837 "%*d " /* (16) cutime %ld */
3838 "%*d " /* (17) cstime %ld */
3839 "%*d " /* (18) priority %ld */
3840 "%*d " /* (19) nice %ld */
3841 "%*d " /* (20) num_threads %ld */
3842 "%*d " /* (21) itrealvalue %ld */
3843 "%" PRIu64, /* (22) starttime %llu */
3844 &starttime);
3845 if (ret != 1) {
3846 fclose(f);
3847 /* Caller can check for EINVAL on 0. */
3848 errno = EINVAL;
3849 return 0;
3850 }
3851
3852 fclose(f);
3853
3854 errno = 0;
3855 return starttime;
3856}
3857
/*
 * Return the reaper start time of @pid converted from clock ticks to
 * seconds, or 0 when either the start time or the tick rate cannot be
 * determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Reject every non-positive result: testing errno as well could let
	 * -1 or 0 through to the division below when errno was stale.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3879
3880static uint64_t get_reaper_age(pid_t pid)
3881{
3882 uint64_t procstart, uptime, procage;
3883
3884 /* We need to substract the time the process has started since system
3885 * boot minus the time when the system has started to get the actual
3886 * reaper age.
3887 */
3888 procstart = get_reaper_start_time_in_sec(pid);
3889 procage = procstart;
3890 if (procstart > 0) {
3891 int ret;
3892 struct timespec spec;
3893
3894 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3895 if (ret < 0)
3896 return 0;
3897 /* We could make this more precise here by using the tv_nsec
3898 * field in the timespec struct and convert it to milliseconds
3899 * and then create a double for the seconds and milliseconds but
3900 * that seems more work than it is worth.
3901 */
3902 uptime = spec.tv_sec;
3903 procage = uptime - procstart;
3904 }
3905
3906 return procage;
3907}
3908
8be92dd1
JS
3909/*
3910 * Returns 0 on success.
3911 * It is the caller's responsibility to free `return_usage`, unless this
3912 * function returns an error.
3913 */
3914static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3915{
3916 int cpucount = get_nprocs();
3917 struct cpuacct_usage *cpu_usage;
3918 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3919 int cg_cpu;
3920 uint64_t cg_user, cg_system;
3921 int64_t ticks_per_sec;
3922 char *usage_str = NULL;
3923
3924 ticks_per_sec = sysconf(_SC_CLK_TCK);
3925
3926 if (ticks_per_sec < 0 && errno == EINVAL) {
3927 lxcfs_debug(
3928 "%s\n",
3929 "read_cpuacct_usage_all failed to determine number of clock ticks "
3930 "in a second");
3931 return -1;
3932 }
3933
3934 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3935 if (!cpu_usage)
3936 return -ENOMEM;
3937
3938 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3939 rv = -1;
3940 goto err;
3941 }
3942
3943 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3944 lxcfs_error("read_cpuacct_usage_all reading first line from "
3945 "%s/cpuacct.usage_all failed.\n", cg);
3946 rv = -1;
3947 goto err;
3948 }
3949
3950 read_pos += read_cnt;
3951
3952 for (i = 0, j = 0; i < cpucount; i++) {
3953 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3954 &cg_system, &read_cnt);
3955
3956 if (ret == EOF)
3957 break;
3958
3959 if (ret != 3) {
3960 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3961 "failed.\n", cg);
3962 rv = -1;
3963 goto err;
3964 }
3965
3966 read_pos += read_cnt;
3967
3968 if (!cpu_in_cpuset(i, cpuset))
3969 continue;
3970
3971 /* Convert the time from nanoseconds to USER_HZ */
3972 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3973 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3974 j++;
3975 }
3976
3977 rv = 0;
3978 *return_usage = cpu_usage;
3979
3980err:
3981 if (usage_str)
3982 free(usage_str);
3983
3984 if (rv != 0) {
3985 free(cpu_usage);
3986 *return_usage = NULL;
3987 }
3988
3989 return rv;
3990}
3991
f34de69a 3992#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
3993static int proc_stat_read(char *buf, size_t size, off_t offset,
3994 struct fuse_file_info *fi)
3995{
3996 struct fuse_context *fc = fuse_get_context();
3997 struct file_info *d = (struct file_info *)fi->fh;
3998 char *cg;
3999 char *cpuset = NULL;
4000 char *line = NULL;
4001 size_t linelen = 0, total_len = 0, rv = 0;
4002 int curcpu = -1; /* cpu numbering starts at 0 */
7144f069 4003 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 4004 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 4005 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
4006 char cpuall[CPUALL_MAX_SIZE];
4007 /* reserve for cpu all */
4008 char *cache = d->buf + CPUALL_MAX_SIZE;
4009 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4010 FILE *f = NULL;
8be92dd1 4011 struct cpuacct_usage *cg_cpu_usage = NULL;
237e200e
SH
4012
4013 if (offset){
4014 if (offset > d->size)
4015 return -EINVAL;
4016 if (!d->cached)
4017 return 0;
4018 int left = d->size - offset;
4019 total_len = left > size ? size: left;
4020 memcpy(buf, d->buf + offset, total_len);
4021 return total_len;
4022 }
4023
4024 pid_t initpid = lookup_initpid_in_store(fc->pid);
4025 if (initpid <= 0)
4026 initpid = fc->pid;
4027 cg = get_pid_cgroup(initpid, "cpuset");
4028 if (!cg)
4029 return read_file("/proc/stat", buf, size, d);
6d2f6996 4030 prune_init_slice(cg);
237e200e
SH
4031
4032 cpuset = get_cpuset(cg);
4033 if (!cpuset)
4034 goto err;
4035
8be92dd1
JS
4036 /*
4037 * Read cpuacct.usage_all for all CPUs.
4038 * If the cpuacct cgroup is present, it is used to calculate the container's
4039 * CPU usage. If not, values from the host's /proc/stat are used.
4040 */
4041 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
4042 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4043 "falling back to the host's /proc/stat");
4044 }
4045
237e200e
SH
4046 f = fopen("/proc/stat", "r");
4047 if (!f)
4048 goto err;
4049
4050 //skip first line
4051 if (getline(&line, &linelen, f) < 0) {
b8defc3d 4052 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
4053 goto err;
4054 }
4055
4056 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4057 ssize_t l;
237e200e
SH
4058 int cpu;
4059 char cpu_char[10]; /* That's a lot of cores */
4060 char *c;
8be92dd1
JS
4061 uint64_t all_used, cg_used, new_idle;
4062 int ret;
237e200e 4063
b4665ce0
SH
4064 if (strlen(line) == 0)
4065 continue;
237e200e
SH
4066 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4067 /* not a ^cpuN line containing a number N, just print it */
9502bae2 4068 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
4069 if (l < 0) {
4070 perror("Error writing to cache");
4071 rv = 0;
4072 goto err;
4073 }
4074 if (l >= cache_size) {
b8defc3d 4075 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4076 rv = 0;
4077 goto err;
4078 }
4079 cache += l;
4080 cache_size -= l;
4081 total_len += l;
4082 continue;
4083 }
4084
4085 if (sscanf(cpu_char, "%d", &cpu) != 1)
4086 continue;
4087 if (!cpu_in_cpuset(cpu, cpuset))
4088 continue;
4089 curcpu ++;
4090
8be92dd1 4091 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
7144f069
CB
4092 &user,
4093 &nice,
4094 &system,
4095 &idle,
4096 &iowait,
4097 &irq,
4098 &softirq,
4099 &steal,
4100 &guest,
8be92dd1
JS
4101 &guest_nice);
4102
4103 if (ret != 10 || !cg_cpu_usage) {
4104 c = strchr(line, ' ');
4105 if (!c)
4106 continue;
4107 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4108 if (l < 0) {
4109 perror("Error writing to cache");
4110 rv = 0;
4111 goto err;
4112
4113 }
4114 if (l >= cache_size) {
4115 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4116 rv = 0;
4117 goto err;
4118 }
4119
4120 cache += l;
4121 cache_size -= l;
4122 total_len += l;
4123
4124 if (ret != 10)
4125 continue;
4126 }
4127
4128 if (cg_cpu_usage) {
4129 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4130 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4131
4132 if (all_used >= cg_used) {
4133 new_idle = idle + (all_used - cg_used);
4134
4135 } else {
4136 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4137 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4138 curcpu, cg, all_used, cg_used);
4139 new_idle = idle;
4140 }
4141
4142 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4143 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4144 new_idle);
4145
4146 if (l < 0) {
4147 perror("Error writing to cache");
4148 rv = 0;
4149 goto err;
4150
4151 }
4152 if (l >= cache_size) {
4153 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4154 rv = 0;
4155 goto err;
4156 }
4157
4158 cache += l;
4159 cache_size -= l;
4160 total_len += l;
4161
4162 user_sum += cg_cpu_usage[curcpu].user;
4163 system_sum += cg_cpu_usage[curcpu].system;
4164 idle_sum += new_idle;
4165
4166 } else {
4167 user_sum += user;
4168 nice_sum += nice;
4169 system_sum += system;
4170 idle_sum += idle;
4171 iowait_sum += iowait;
4172 irq_sum += irq;
4173 softirq_sum += softirq;
4174 steal_sum += steal;
4175 guest_sum += guest;
4176 guest_nice_sum += guest_nice;
4177 }
237e200e
SH
4178 }
4179
4180 cache = d->buf;
4181
7144f069
CB
4182 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4183 user_sum,
4184 nice_sum,
4185 system_sum,
4186 idle_sum,
4187 iowait_sum,
4188 irq_sum,
4189 softirq_sum,
4190 steal_sum,
4191 guest_sum,
4192 guest_nice_sum);
4193 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
4194 memcpy(cache, cpuall, cpuall_len);
4195 cache += cpuall_len;
7144f069 4196 } else {
237e200e 4197 /* shouldn't happen */
b8defc3d 4198 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
4199 cpuall_len = 0;
4200 }
4201
4202 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4203 total_len += cpuall_len;
4204 d->cached = 1;
4205 d->size = total_len;
7144f069
CB
4206 if (total_len > size)
4207 total_len = size;
237e200e
SH
4208
4209 memcpy(buf, d->buf, total_len);
4210 rv = total_len;
4211
4212err:
4213 if (f)
4214 fclose(f);
8be92dd1
JS
4215 if (cg_cpu_usage)
4216 free(cg_cpu_usage);
237e200e
SH
4217 free(line);
4218 free(cpuset);
4219 free(cg);
4220 return rv;
4221}
4222
0ecddf02
CB
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * Returns the cpuacct.usage value divided by 10^9, or 0 when the reaper, its
 * cgroup or the usage value cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;

	/* Parse and scale in 64 bit, narrow afterwards: with a 32-bit
	 * unsigned long, strtoul() would saturate at ULONG_MAX before the
	 * division is applied.
	 */
	usage = (unsigned long)(strtoull(usage_str, NULL, 10) / 1000000000ULL);

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
4253
#if RELOADTEST
/*
 * Create the marker file /tmp/lxcfs-iwashere (mode 0644). Only built when
 * RELOADTEST is set.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
4264
4265/*
4266 * We read /proc/uptime and reuse its second field.
4267 * For the first field, we use the mtime for the reaper for
4268 * the calling pid as returned by getreaperage
4269 */
4270static int proc_uptime_read(char *buf, size_t size, off_t offset,
4271 struct fuse_file_info *fi)
4272{
4273 struct fuse_context *fc = fuse_get_context();
4274 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 4275 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 4276 char *cache = d->buf;
a262ddb7 4277 ssize_t total_len = 0;
0ecddf02 4278 uint64_t idletime, reaperage;
237e200e
SH
4279
4280#if RELOADTEST
4281 iwashere();
4282#endif
4283
4284 if (offset){
237e200e
SH
4285 if (!d->cached)
4286 return 0;
bbdf646b
BM
4287 if (offset > d->size)
4288 return -EINVAL;
237e200e
SH
4289 int left = d->size - offset;
4290 total_len = left > size ? size: left;
4291 memcpy(buf, cache + offset, total_len);
4292 return total_len;
4293 }
4294
0ecddf02
CB
4295 reaperage = get_reaper_age(fc->pid);
4296 /* To understand why this is done, please read the comment to the
4297 * get_reaper_busy() function.
4298 */
4299 idletime = reaperage;
4300 if (reaperage >= busytime)
4301 idletime = reaperage - busytime;
237e200e 4302
bbdf646b
BM
4303 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4304 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 4305 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
4306 return 0;
4307 }
4308
4309 d->size = (int)total_len;
4310 d->cached = 1;
4311
4312 if (total_len > size) total_len = size;
4313
4314 memcpy(buf, d->buf, total_len);
4315 return total_len;
4316}
4317
4318static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4319 struct fuse_file_info *fi)
4320{
4321 char dev_name[72];
4322 struct fuse_context *fc = fuse_get_context();
4323 struct file_info *d = (struct file_info *)fi->fh;
4324 char *cg;
4325 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4326 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4327 unsigned long read = 0, write = 0;
4328 unsigned long read_merged = 0, write_merged = 0;
4329 unsigned long read_sectors = 0, write_sectors = 0;
4330 unsigned long read_ticks = 0, write_ticks = 0;
4331 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4332 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4333 char *cache = d->buf;
4334 size_t cache_size = d->buflen;
4335 char *line = NULL;
4336 size_t linelen = 0, total_len = 0, rv = 0;
4337 unsigned int major = 0, minor = 0;
4338 int i = 0;
4339 FILE *f = NULL;
4340
4341 if (offset){
4342 if (offset > d->size)
4343 return -EINVAL;
4344 if (!d->cached)
4345 return 0;
4346 int left = d->size - offset;
4347 total_len = left > size ? size: left;
4348 memcpy(buf, cache + offset, total_len);
4349 return total_len;
4350 }
4351
4352 pid_t initpid = lookup_initpid_in_store(fc->pid);
4353 if (initpid <= 0)
4354 initpid = fc->pid;
4355 cg = get_pid_cgroup(initpid, "blkio");
4356 if (!cg)
4357 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 4358 prune_init_slice(cg);
237e200e 4359
2209fe50 4360 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 4361 goto err;
2209fe50 4362 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 4363 goto err;
2209fe50 4364 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 4365 goto err;
2209fe50 4366 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 4367 goto err;
2209fe50 4368 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
4369 goto err;
4370
4371
4372 f = fopen("/proc/diskstats", "r");
4373 if (!f)
4374 goto err;
4375
4376 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4377 ssize_t l;
2209fe50 4378 char lbuf[256];
237e200e
SH
4379
4380 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 4381 if (i != 3)
237e200e 4382 continue;
2209fe50
SH
4383
4384 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4385 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4386 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4387 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4388 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4389 read_sectors = read_sectors/512;
4390 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4391 write_sectors = write_sectors/512;
4392
4393 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4394 rd_svctm = rd_svctm/1000000;
4395 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4396 rd_wait = rd_wait/1000000;
4397 read_ticks = rd_svctm + rd_wait;
4398
4399 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4400 wr_svctm = wr_svctm/1000000;
4401 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4402 wr_wait = wr_wait/1000000;
4403 write_ticks = wr_svctm + wr_wait;
4404
4405 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4406 tot_ticks = tot_ticks/1000000;
237e200e
SH
4407
4408 memset(lbuf, 0, 256);
2db31eb6
SH
4409 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4410 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4411 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4412 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4413 else
4414 continue;
237e200e 4415
2209fe50 4416 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
4417 if (l < 0) {
4418 perror("Error writing to fuse buf");
4419 rv = 0;
4420 goto err;
4421 }
4422 if (l >= cache_size) {
b8defc3d 4423 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4424 rv = 0;
4425 goto err;
4426 }
4427 cache += l;
4428 cache_size -= l;
4429 total_len += l;
4430 }
4431
4432 d->cached = 1;
4433 d->size = total_len;
4434 if (total_len > size ) total_len = size;
4435 memcpy(buf, d->buf, total_len);
4436
4437 rv = total_len;
4438err:
4439 free(cg);
4440 if (f)
4441 fclose(f);
4442 free(line);
4443 free(io_serviced_str);
4444 free(io_merged_str);
4445 free(io_service_bytes_str);
4446 free(io_wait_time_str);
4447 free(io_service_time_str);
4448 return rv;
4449}
4450
70dcc12e
SH
4451static int proc_swaps_read(char *buf, size_t size, off_t offset,
4452 struct fuse_file_info *fi)
4453{
4454 struct fuse_context *fc = fuse_get_context();
4455 struct file_info *d = (struct file_info *)fi->fh;
4456 char *cg = NULL;
018246ff 4457 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 4458 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
4459 ssize_t total_len = 0, rv = 0;
4460 ssize_t l = 0;
70dcc12e
SH
4461 char *cache = d->buf;
4462
4463 if (offset) {
4464 if (offset > d->size)
4465 return -EINVAL;
4466 if (!d->cached)
4467 return 0;
4468 int left = d->size - offset;
4469 total_len = left > size ? size: left;
4470 memcpy(buf, cache + offset, total_len);
4471 return total_len;
4472 }
4473
4474 pid_t initpid = lookup_initpid_in_store(fc->pid);
4475 if (initpid <= 0)
4476 initpid = fc->pid;
4477 cg = get_pid_cgroup(initpid, "memory");
4478 if (!cg)
4479 return read_file("/proc/swaps", buf, size, d);
6d2f6996 4480 prune_init_slice(cg);
70dcc12e 4481
018246ff 4482 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
4483
4484 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4485 goto err;
4486
70dcc12e
SH
4487 memusage = strtoul(memusage_str, NULL, 10);
4488
4489 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4490 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4491
018246ff 4492 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
4493 memswusage = strtoul(memswusage_str, NULL, 10);
4494
70dcc12e
SH
4495 swap_total = (memswlimit - memlimit) / 1024;
4496 swap_free = (memswusage - memusage) / 1024;
4497 }
4498
4499 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4500
4501 /* When no mem + swap limit is specified or swapaccount=0*/
4502 if (!memswlimit) {
4503 char *line = NULL;
4504 size_t linelen = 0;
4505 FILE *f = fopen("/proc/meminfo", "r");
4506
4507 if (!f)
4508 goto err;
4509
4510 while (getline(&line, &linelen, f) != -1) {
4511 if (startswith(line, "SwapTotal:")) {
4512 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4513 } else if (startswith(line, "SwapFree:")) {
4514 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4515 }
4516 }
4517
4518 free(line);
4519 fclose(f);
4520 }
4521
4522 if (swap_total > 0) {
a262ddb7
CB
4523 l = snprintf(d->buf + total_len, d->size - total_len,
4524 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4525 swap_total, swap_free);
4526 total_len += l;
70dcc12e
SH
4527 }
4528
a262ddb7 4529 if (total_len < 0 || l < 0) {
70dcc12e
SH
4530 perror("Error writing to cache");
4531 rv = 0;
4532 goto err;
4533 }
4534
4535 d->cached = 1;
4536 d->size = (int)total_len;
4537
4538 if (total_len > size) total_len = size;
4539 memcpy(buf, d->buf, total_len);
4540 rv = total_len;
4541
4542err:
4543 free(cg);
4544 free(memswlimit_str);
4545 free(memlimit_str);
4546 free(memusage_str);
4547 free(memswusage_str);
70dcc12e
SH
4548 return rv;
4549}
/*
 * Collect the pids listed in cgroup.procs of a cgroup and of its child
 * cgroups, e.g. starting from /sys/fs/cgroup/cpu/docker/containerid.
 * @pid_buf : array of strings the pid lines are appended to (grown here).
 * @dpath   : the path of the cgroup, relative to @cfd, e.g.
 *            /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many further levels of child cgroups to descend into.
 * @sum     : number of entries already stored in @pid_buf.
 * @cfd     : file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu
 *
 * Returns the updated number of entries in @pid_buf. Each stored entry is a
 * copy of one line of cgroup.procs exactly as returned by getline().
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	struct dirent *entry;
	FILE *procs = NULL;
	DIR *cgdir;
	char *line = NULL;
	size_t linecap = 0;
	char *path;
	int fd;

	/* path = dpath + "/cgroup.procs" + \0; allocation is retried until
	 * it succeeds.
	 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);
	strcpy(path, dpath);

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	cgdir = fdopendir(fd);
	if (!cgdir) {
		close(fd);
		goto out;
	}

	/* Descend into child cgroups first, then handle this level. */
	while (((entry = readdir(cgdir)) != NULL) && depth > 0) {
		char *child;

		/* Skips ".", ".." and any other name starting with a dot. */
		if (entry->d_name[0] == '.')
			continue;
		if (entry->d_type != DT_DIR)
			continue;

		/* path + '/' + d_name + \0 */
		do {
			child = malloc(strlen(path) + 2 + sizeof(entry->d_name));
		} while (!child);
		sprintf(child, "%s/%s", path, entry->d_name);

		sum = calc_pid(pid_buf, child, depth - 1, sum, cfd);
		free(child);
	}
	closedir(cgdir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	procs = fdopen(fd, "r");
	if (!procs) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linecap, procs) != -1) {
		size_t len = strlen(line) + 1;
		char **grown;
		char *copy;

		do {
			grown = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!grown);
		*pid_buf = grown;

		do {
			copy = malloc(len);
		} while (!copy);
		memcpy(copy, line, len);

		grown[sum] = copy;
		sum++;
	}
	fclose(procs);
out:
	free(line);
	free(path);
	return sum;
}
4636/*
4637 * calc_load calculates the load according to the following formula:
4638 * load1 = load0 * exp + active * (1 - exp)
4639 *
4640 * @load1: the new loadavg.
4641 * @load0: the former loadavg.
4642 * @active: the total number of running pid at this moment.
4643 * @exp: the fixed-point defined in the beginning.
4644 */
4645static unsigned long
4646calc_load(unsigned long load, unsigned long exp, unsigned long active)
4647{
4648 unsigned long newload;
4649
4650 active = active > 0 ? active * FIXED_1 : 0;
4651 newload = load * exp + active * (FIXED_1 - exp);
4652 if (active >= load)
4653 newload += FIXED_1 - 1;
4654
4655 return newload / FIXED_1;
4656}
4657
4658/*
4659 * Return 0 means that container p->cg is closed.
4660 * Return -1 means that error occurred in refresh.
4661 * Positive num equals the total number of pid.
4662 */
4663static int refresh_load(struct load_node *p, char *path)
4664{
4665 FILE *f = NULL;
4666 char **idbuf;
4667 char proc_path[256];
4668 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4669 char *line = NULL;
4670 size_t linelen = 0;
4671 int sum, length;
4672 DIR *dp;
4673 struct dirent *file;
4674
4675 do {
4676 idbuf = malloc(sizeof(char *));
4677 } while (!idbuf);
4678 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4679 /* normal exit */
4680 if (sum == 0)
4681 goto out;
4682
4683 for (i = 0; i < sum; i++) {
4684 /*clean up '\n' */
4685 length = strlen(idbuf[i])-1;
4686 idbuf[i][length] = '\0';
4687 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4688 if (ret < 0 || ret > 255) {
4689 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4690 i = sum;
4691 sum = -1;
4692 goto err_out;
4693 }
4694
4695 dp = opendir(proc_path);
4696 if (!dp) {
4697 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4698 continue;
4699 }
4700 while ((file = readdir(dp)) != NULL) {
4701 if (strncmp(file->d_name, ".", 1) == 0)
4702 continue;
4703 if (strncmp(file->d_name, "..", 1) == 0)
4704 continue;
4705 total_pid++;
4706 /* We make the biggest pid become last_pid.*/
4707 ret = atof(file->d_name);
4708 last_pid = (ret > last_pid) ? ret : last_pid;
4709
4710 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4711 if (ret < 0 || ret > 255) {
4712 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4713 i = sum;
4714 sum = -1;
4715 closedir(dp);
4716 goto err_out;
4717 }
4718 f = fopen(proc_path, "r");
4719 if (f != NULL) {
4720 while (getline(&line, &linelen, f) != -1) {
4721 /* Find State */
4722 if ((line[0] == 'S') && (line[1] == 't'))
4723 break;
4724 }
4725 if ((line[7] == 'R') || (line[7] == 'D'))
4726 run_pid++;
4727 fclose(f);
4728 }
4729 }
4730 closedir(dp);
4731 }
4732 /*Calculate the loadavg.*/
4733 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4734 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4735 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4736 p->run_pid = run_pid;
4737 p->total_pid = total_pid;
4738 p->last_pid = last_pid;
4739
4740 free(line);
beb5024e 4741err_out:
6db4f7a3 4742 for (; i > 0; i--)
4743 free(idbuf[i-1]);
4744out:
4745 free(idbuf);
4746 return sum;
4747}
4748/*
4749 * Traverse the hash table and update it.
4750 */
4751void *load_begin(void *arg)
4752{
4753
4754 char *path = NULL;
4755 int i, sum, length, ret;
4756 struct load_node *f;
4757 int first_node;
4758 clock_t time1, time2;
4759
4760 while (1) {
a83618e2
JS
4761 if (loadavg_stop == 1)
4762 return NULL;
4763
6db4f7a3 4764 time1 = clock();
4765 for (i = 0; i < LOAD_SIZE; i++) {
4766 pthread_mutex_lock(&load_hash[i].lock);
4767 if (load_hash[i].next == NULL) {
4768 pthread_mutex_unlock(&load_hash[i].lock);
4769 continue;
4770 }
4771 f = load_hash[i].next;
4772 first_node = 1;
4773 while (f) {
4774 length = strlen(f->cg) + 2;
4775 do {
4776 /* strlen(f->cg) + '.' or '' + \0 */
4777 path = malloc(length);
4778 } while (!path);
4779
4780 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4781 if (ret < 0 || ret > length - 1) {
4782 /* snprintf failed, ignore the node.*/
4783 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4784 goto out;
4785 }
4786 sum = refresh_load(f, path);
4787 if (sum == 0) {
4788 f = del_node(f, i);
4789 } else {
4790out: f = f->next;
4791 }
4792 free(path);
4793 /* load_hash[i].lock locks only on the first node.*/
4794 if (first_node == 1) {
4795 first_node = 0;
4796 pthread_mutex_unlock(&load_hash[i].lock);
4797 }
4798 }
4799 }
a83618e2
JS
4800
4801 if (loadavg_stop == 1)
4802 return NULL;
4803
6db4f7a3 4804 time2 = clock();
4805 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4806 }
4807}
4808
4809static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4810 struct fuse_file_info *fi)
4811{
4812 struct fuse_context *fc = fuse_get_context();
4813 struct file_info *d = (struct file_info *)fi->fh;
4814 pid_t initpid;
4815 char *cg;
4816 size_t total_len = 0;
4817 char *cache = d->buf;
4818 struct load_node *n;
4819 int hash;
01d88ede 4820 int cfd, rv = 0;
6db4f7a3 4821 unsigned long a, b, c;
4822
4823 if (offset) {
4824 if (offset > d->size)
4825 return -EINVAL;
4826 if (!d->cached)
4827 return 0;
4828 int left = d->size - offset;
4829 total_len = left > size ? size : left;
4830 memcpy(buf, cache + offset, total_len);
4831 return total_len;
4832 }
4833 if (!loadavg)
4834 return read_file("/proc/loadavg", buf, size, d);
4835
4836 initpid = lookup_initpid_in_store(fc->pid);
4837 if (initpid <= 0)
4838 initpid = fc->pid;
4839 cg = get_pid_cgroup(initpid, "cpu");
4840 if (!cg)
4841 return read_file("/proc/loadavg", buf, size, d);
4842
4843 prune_init_slice(cg);
b077527b 4844 hash = calc_hash(cg) % LOAD_SIZE;
6db4f7a3 4845 n = locate_node(cg, hash);
4846
4847 /* First time */
4848 if (n == NULL) {
4849 if (!find_mounted_controller("cpu", &cfd)) {
4850 /*
4851 * In locate_node() above, pthread_rwlock_unlock() isn't used
4852 * because delete is not allowed before read has ended.
4853 */
4854 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
4855 rv = 0;
4856 goto err;
6db4f7a3 4857 }
4858 do {
4859 n = malloc(sizeof(struct load_node));
4860 } while (!n);
4861
4862 do {
4863 n->cg = malloc(strlen(cg)+1);
4864 } while (!n->cg);
4865 strcpy(n->cg, cg);
4866 n->avenrun[0] = 0;
4867 n->avenrun[1] = 0;
4868 n->avenrun[2] = 0;
4869 n->run_pid = 0;
4870 n->total_pid = 1;
4871 n->last_pid = initpid;
4872 n->cfd = cfd;
4873 insert_node(&n, hash);
4874 }
4875 a = n->avenrun[0] + (FIXED_1/200);
4876 b = n->avenrun[1] + (FIXED_1/200);
4877 c = n->avenrun[2] + (FIXED_1/200);
4878 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4879 LOAD_INT(a), LOAD_FRAC(a),
4880 LOAD_INT(b), LOAD_FRAC(b),
4881 LOAD_INT(c), LOAD_FRAC(c),
4882 n->run_pid, n->total_pid, n->last_pid);
4883 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4884 if (total_len < 0 || total_len >= d->buflen) {
4885 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
4886 rv = 0;
4887 goto err;
6db4f7a3 4888 }
4889 d->size = (int)total_len;
4890 d->cached = 1;
4891
4892 if (total_len > size)
4893 total_len = size;
4894 memcpy(buf, d->buf, total_len);
01d88ede
JS
4895 rv = total_len;
4896
4897err:
4898 free(cg);
4899 return rv;
6db4f7a3 4900}
4901/* Return a positive number on success, return 0 on failure.*/
4902pthread_t load_daemon(int load_use)
4903{
4904 int ret;
4905 pthread_t pid;
4906
4907 ret = init_load();
4908 if (ret == -1) {
4909 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4910 return 0;
4911 }
4912 ret = pthread_create(&pid, NULL, load_begin, NULL);
4913 if (ret != 0) {
4914 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4915 load_free();
4916 return 0;
4917 }
4918 /* use loadavg, here loadavg = 1*/
4919 loadavg = load_use;
4920 return pid;
4921}
70dcc12e 4922
a83618e2
JS
4923/* Returns 0 on success. */
4924int stop_load_daemon(pthread_t pid)
4925{
4926 int s;
4927
4928 /* Signal the thread to gracefully stop */
4929 loadavg_stop = 1;
4930
4931 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
4932 if (s != 0) {
4933 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4934 return -1;
4935 }
4936
4937 load_free();
4938 loadavg_stop = 0;
4939
4940 return 0;
4941}
4942
237e200e
SH
/*
 * Return the number of bytes obtained by reading the file @which line by
 * line until getline() stops, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t got, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	for (;;) {
		got = getline(&line, &cap, fp);
		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(line);

	return total;
}
4959
/*
 * Fill @sb for a path below the emulated /proc directory.
 *
 * "/proc" is reported as a read-only directory, each emulated file as a
 * read-only regular file of size 0. All timestamps are set to the current
 * time and the owner to uid/gid 0.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(proc_files) / sizeof(proc_files[0]); i++) {
		if (strcmp(path, proc_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4989
4990int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4991 struct fuse_file_info *fi)
4992{
d639f863
CB
4993 if (filler(buf, ".", NULL, 0) != 0 ||
4994 filler(buf, "..", NULL, 0) != 0 ||
4995 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4996 filler(buf, "meminfo", NULL, 0) != 0 ||
4997 filler(buf, "stat", NULL, 0) != 0 ||
4998 filler(buf, "uptime", NULL, 0) != 0 ||
4999 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 5000 filler(buf, "swaps", NULL, 0) != 0 ||
5001 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
5002 return -EINVAL;
5003 return 0;
5004}
5005
5006int proc_open(const char *path, struct fuse_file_info *fi)
5007{
5008 int type = -1;
5009 struct file_info *info;
5010
5011 if (strcmp(path, "/proc/meminfo") == 0)
5012 type = LXC_TYPE_PROC_MEMINFO;
5013 else if (strcmp(path, "/proc/cpuinfo") == 0)
5014 type = LXC_TYPE_PROC_CPUINFO;
5015 else if (strcmp(path, "/proc/uptime") == 0)
5016 type = LXC_TYPE_PROC_UPTIME;
5017 else if (strcmp(path, "/proc/stat") == 0)
5018 type = LXC_TYPE_PROC_STAT;
5019 else if (strcmp(path, "/proc/diskstats") == 0)
5020 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
5021 else if (strcmp(path, "/proc/swaps") == 0)
5022 type = LXC_TYPE_PROC_SWAPS;
46be8eed 5023 else if (strcmp(path, "/proc/loadavg") == 0)
5024 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
5025 if (type == -1)
5026 return -ENOENT;
5027
5028 info = malloc(sizeof(*info));
5029 if (!info)
5030 return -ENOMEM;
5031
5032 memset(info, 0, sizeof(*info));
5033 info->type = type;
5034
5035 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5036 do {
5037 info->buf = malloc(info->buflen);
5038 } while (!info->buf);
5039 memset(info->buf, 0, info->buflen);
5040 /* set actual size to buffer size */
5041 info->size = info->buflen;
5042
5043 fi->fh = (unsigned long)info;
5044 return 0;
5045}
5046
bddbb106
SH
/*
 * Access check for the emulated /proc tree.
 *
 * "/proc" itself is granted when access(2) confirms it is readable.
 * Everything else is read-only: any bit in @mask other than R_OK yields
 * -EACCES, otherwise 0 is returned.
 */
int proc_access(const char *path, int mask)
{
	bool is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if (mask & ~R_OK)
		return -EACCES;

	return 0;
}
5057
237e200e
SH
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always reports success. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
5063
5064int proc_read(const char *path, char *buf, size_t size, off_t offset,
5065 struct fuse_file_info *fi)
5066{
5067 struct file_info *f = (struct file_info *) fi->fh;
5068
5069 switch (f->type) {
5070 case LXC_TYPE_PROC_MEMINFO:
5071 return proc_meminfo_read(buf, size, offset, fi);
5072 case LXC_TYPE_PROC_CPUINFO:
5073 return proc_cpuinfo_read(buf, size, offset, fi);
5074 case LXC_TYPE_PROC_UPTIME:
5075 return proc_uptime_read(buf, size, offset, fi);
5076 case LXC_TYPE_PROC_STAT:
5077 return proc_stat_read(buf, size, offset, fi);
5078 case LXC_TYPE_PROC_DISKSTATS:
5079 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
5080 case LXC_TYPE_PROC_SWAPS:
5081 return proc_swaps_read(buf, size, offset, fi);
46be8eed 5082 case LXC_TYPE_PROC_LOADAVG:
5083 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
5084 default:
5085 return -EINVAL;
5086 }
5087}
5088
29a73c2f
CB
5089/*
5090 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
5091 */
5092
/*
 * Create @dir and all of its missing parent directories with @mode.
 * Already existing directories are not an error.
 * Returns true on success, false on allocation or mkdir() failure.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *start = dir;
	const char *comp;          /* first character of the current component */
	const char *comp_end = dir; /* first character after that component */
	char *prefix;

	for (;;) {
		comp = comp_end + strspn(comp_end, "/");
		comp_end = comp + strcspn(comp, "/");

		/* Everything before the current component must exist by now. */
		prefix = strndup(start, comp - start);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (comp_end == comp)
			break;
	}

	return true;
}
5116
5117static bool umount_if_mounted(void)
5118{
5119 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5120 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5121 return false;
5122 }
5123 return true;
5124}
5125
2283e240
CB
/*
 * The type of statfs.f_type is taken from the struct itself.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
5132
0a4dea41
CB
/*
 * Scan /proc/self/mountinfo for an entry whose mount point is "/" and whose
 * filesystem part reads "- rootfs rootfs ".
 *
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true when such an entry exists, false otherwise (also when the
 * file cannot be opened).
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *line = NULL;
	size_t cap = 0;
	bool found = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!found && getline(&line, &cap, mounts) != -1) {
		char *field = line;
		char *field_end, *sep;
		int skipped = 0;

		/* Advance to the blank in front of the fifth field. */
		while (field && skipped < 4) {
			field = strchr(field + 1, ' ');
			skipped++;
		}
		if (!field)
			continue;

		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		sep = strchr(field_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(mounts);
	return found;
}
5175
cc309f33 5176static int pivot_enter()
0a4dea41 5177{
cc309f33
CB
5178 int ret = -1, oldroot = -1, newroot = -1;
5179
5180 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5181 if (oldroot < 0) {
5182 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5183 return ret;
5184 }
5185
5186 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5187 if (newroot < 0) {
5188 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5189 goto err;
5190 }
5191
5192 /* change into new root fs */
5193 if (fchdir(newroot) < 0) {
5194 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5195 goto err;
5196 }
5197
0a4dea41
CB
5198 /* pivot_root into our new root fs */
5199 if (pivot_root(".", ".") < 0) {
5200 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5201 goto err;
0a4dea41
CB
5202 }
5203
5204 /*
5205 * At this point the old-root is mounted on top of our new-root.
5206 * To unmounted it we must not be chdir'd into it, so escape back
5207 * to the old-root.
5208 */
5209 if (fchdir(oldroot) < 0) {
5210 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5211 goto err;
0a4dea41
CB
5212 }
5213
5214 if (umount2(".", MNT_DETACH) < 0) {
5215 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5216 goto err;
0a4dea41
CB
5217 }
5218
5219 if (fchdir(newroot) < 0) {
5220 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 5221 goto err;
0a4dea41
CB
5222 }
5223
cc309f33
CB
5224 ret = 0;
5225
5226err:
5227 if (oldroot > 0)
5228 close(oldroot);
5229 if (newroot > 0)
5230 close(newroot);
5231
5232 return ret;
0a4dea41
CB
5233}
5234
5235static int chroot_enter()
5236{
5237 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5238 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5239 return -1;
5240 }
5241
5242 if (chroot(".") < 0) {
5243 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5244 return -1;
5245 }
5246
5247 if (chdir("/") < 0) {
5248 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5249 return -1;
5250 }
5251
5252 return 0;
5253}
5254
/*
 * Enter the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() in all other cases.
 * Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * the mount table through is_on_ramfs(). */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
5277
5278/* Prepare our new clean root. */
0232cbac 5279static int permute_prepare(void)
29a73c2f
CB
5280{
5281 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 5282 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
5283 return -1;
5284 }
5285
5286 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 5287 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
5288 return -1;
5289 }
5290
5291 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 5292 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
5293 return -1;
5294 }
5295
5296 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 5297 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
5298 return -1;
5299 }
5300
5301 return 0;
5302}
5303
0232cbac
CB
/*
 * Switch to the private root: prepare it first, then enter it.
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * Returns true when both steps succeeded.
 */
static bool permute_root(void)
{
	/* The new root is only entered after it was prepared successfully. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
5317
a257a8ee
CB
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 * Returns the descriptor, or -1 when the path cannot be built or opened.
 */
static int preserve_mnt_ns(int pid)
{
	/* "/proc/" + up to 21 characters for the pid + "/ns/mnt" */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (written < 0)
		return -1;
	if ((size_t)written >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
5330
0a4dea41 5331static bool cgfs_prepare_mounts(void)
29a73c2f
CB
5332{
5333 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 5334 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
5335 return false;
5336 }
480262c9 5337
29a73c2f 5338 if (!umount_if_mounted()) {
b8defc3d 5339 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
5340 return false;
5341 }
5342
5343 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 5344 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
5345 return false;
5346 }
5347
a257a8ee
CB
5348 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5349 if (cgroup_mount_ns_fd < 0) {
5350 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5351 return false;
5352 }
5353
480262c9 5354 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 5355 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
5356 return false;
5357 }
480262c9 5358
29a73c2f 5359 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 5360 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
5361 return false;
5362 }
480262c9 5363
29a73c2f
CB
5364 return true;
5365}
5366
0a4dea41 5367static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
5368{
5369 char *target;
5370 size_t clen, len;
5371 int i, ret;
5372
5373 for (i = 0; i < num_hierarchies; i++) {
5374 char *controller = hierarchies[i];
51c7ca35 5375
29a73c2f
CB
5376 clen = strlen(controller);
5377 len = strlen(BASEDIR) + clen + 2;
5378 target = malloc(len);
5379 if (!target)
5380 return false;
51c7ca35 5381
29a73c2f
CB
5382 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5383 if (ret < 0 || ret >= len) {
5384 free(target);
5385 return false;
5386 }
5387 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5388 free(target);
5389 return false;
5390 }
51c7ca35
CB
5391 if (!strcmp(controller, "unified"))
5392 ret = mount("none", target, "cgroup2", 0, NULL);
5393 else
5394 ret = mount(controller, target, "cgroup", 0, controller);
5395 if (ret < 0) {
5396 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
5397 free(target);
5398 return false;
5399 }
5400
5401 fd_hierarchies[i] = open(target, O_DIRECTORY);
5402 if (fd_hierarchies[i] < 0) {
5403 free(target);
5404 return false;
5405 }
5406 free(target);
5407 }
5408 return true;
5409}
5410
/*
 * Build the private cgroup mounts: prepare the mount namespace, mount the
 * hierarchies, then switch to the private root.
 * Returns true when all three steps succeeded.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root() ? true : false;
}
5426
5427static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
5428{
5429 FILE *f;
e58dab00
CB
5430 char *cret, *line = NULL;
5431 char cwd[MAXPATHLEN];
237e200e 5432 size_t len = 0;
480262c9 5433 int i, init_ns = -1;
51c7ca35 5434 bool found_unified = false;
237e200e
SH
5435
5436 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 5437 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
5438 return;
5439 }
e58dab00 5440
237e200e 5441 while (getline(&line, &len, f) != -1) {
51c7ca35 5442 char *idx, *p, *p2;
237e200e
SH
5443
5444 p = strchr(line, ':');
5445 if (!p)
5446 goto out;
51c7ca35 5447 idx = line;
237e200e
SH
5448 *(p++) = '\0';
5449
5450 p2 = strrchr(p, ':');
5451 if (!p2)
5452 goto out;
5453 *p2 = '\0';
5454
a67719f6
CB
5455 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5456 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5457 * because it parses out the empty string "" and later on passes
5458 * it to mount(). Let's skip such entries.
5459 */
51c7ca35
CB
5460 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5461 found_unified = true;
5462 p = "unified";
5463 }
a67719f6 5464
237e200e
SH
5465 if (!store_hierarchy(line, p))
5466 goto out;
5467 }
5468
480262c9 5469 /* Preserve initial namespace. */
a257a8ee 5470 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
5471 if (init_ns < 0) {
5472 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 5473 goto out;
b8defc3d 5474 }
480262c9 5475
92c3ee11 5476 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
5477 if (!fd_hierarchies) {
5478 lxcfs_error("%s\n", strerror(errno));
29a73c2f 5479 goto out;
b8defc3d 5480 }
29a73c2f 5481
480262c9
CB
5482 for (i = 0; i < num_hierarchies; i++)
5483 fd_hierarchies[i] = -1;
5484
e58dab00
CB
5485 cret = getcwd(cwd, MAXPATHLEN);
5486 if (!cret)
5487 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5488
480262c9
CB
5489 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5490 * to privately mount lxcfs cgroups. */
b8defc3d
CB
5491 if (!cgfs_setup_controllers()) {
5492 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 5493 goto out;
b8defc3d 5494 }
480262c9 5495
b8defc3d
CB
5496 if (setns(init_ns, 0) < 0) {
5497 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 5498 goto out;
b8defc3d 5499 }
29a73c2f 5500
e58dab00
CB
5501 if (!cret || chdir(cwd) < 0)
5502 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5503
237e200e
SH
5504 print_subsystems();
5505
5506out:
5507 free(line);
5508 fclose(f);
480262c9
CB
5509 if (init_ns >= 0)
5510 close(init_ns);
237e200e
SH
5511}
5512
5513static void __attribute__((destructor)) free_subsystems(void)
5514{
5515 int i;
5516
b8defc3d
CB
5517 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5518
29a73c2f 5519 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
5520 if (hierarchies[i])
5521 free(hierarchies[i]);
480262c9 5522 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
5523 close(fd_hierarchies[i]);
5524 }
237e200e 5525 free(hierarchies);
480262c9 5526 free(fd_hierarchies);
a257a8ee
CB
5527
5528 if (cgroup_mount_ns_fd >= 0)
5529 close(cgroup_mount_ns_fd);
237e200e 5530}