]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Release LXCFS 6.0.0
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
/*
 * String length budget for a 64-bit integer printed in decimal.
 * The largest value, 2^64 - 1, has 20 digits; 21 presumably leaves room
 * for the terminating NUL byte.
 */
#define LXCFS_NUMSTRLEN64 21
44
/*
 * pivot_root(): use the C library's wrapper when the build system found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to a raw syscall.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
/* Identifiers for the kinds of files handled in this file. */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};

/* Per-file bookkeeping record. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};

/* CPU time counters for a single CPU, plus whether that CPU is online. */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online;
};
89
/* Parameters of the load-average hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */

/* Fixed-point arithmetic used when calculating the load average. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
/* Integer part and two-digit fractional part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

/*
 * Switch consulted by proc_loadavg_read():
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * Hash a NUL-terminated string with the ELFHash algorithm.
 * The result is masked to 31 bits, so it is never negative.
 */
static int calc_hash(const char *name)
{
	unsigned int h = 0;
	const char *p;

	for (p = name; *p; p++) {
		unsigned int top;

		h = (h << 4) + *p;
		/* Fold the four highest bits back in, then clear them. */
		top = h & 0xf0000000;
		if (top)
			h ^= (top >> 24);
		h &= ~top;
	}

	return (h & 0x7fffffff);
}
122
123 struct load_node {
124 char *cg; /*cg */
125 unsigned long avenrun[3]; /* Load averages */
126 unsigned int run_pid;
127 unsigned int total_pid;
128 unsigned int last_pid;
129 int cfd; /* The file descriptor of the mounted cgroup */
130 struct load_node *next;
131 struct load_node **pre;
132 };
133
134 struct load_head {
135 /*
136 * The lock is about insert load_node and refresh load_node.To the first
137 * load_node of each hash bucket, insert and refresh in this hash bucket is
138 * mutually exclusive.
139 */
140 pthread_mutex_t lock;
141 /*
142 * The rdlock is about read loadavg and delete load_node.To each hash
143 * bucket, read and delete is mutually exclusive. But at the same time, we
144 * allow paratactic read operation. This rdlock is at list level.
145 */
146 pthread_rwlock_t rdlock;
147 /*
148 * The rilock is about read loadavg and insert load_node.To the first
149 * load_node of each hash bucket, read and insert is mutually exclusive.
150 * But at the same time, we allow paratactic read operation.
151 */
152 pthread_rwlock_t rilock;
153 struct load_node *next;
154 };
155
156 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
157 /*
158 * init_load initialize the hash table.
159 * Return 0 on success, return -1 on failure.
160 */
161 static int init_load(void)
162 {
163 int i;
164 int ret;
165
166 for (i = 0; i < LOAD_SIZE; i++) {
167 load_hash[i].next = NULL;
168 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
169 if (ret != 0) {
170 lxcfs_error("%s\n", "Failed to initialize lock");
171 goto out3;
172 }
173 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
174 if (ret != 0) {
175 lxcfs_error("%s\n", "Failed to initialize rdlock");
176 goto out2;
177 }
178 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
179 if (ret != 0) {
180 lxcfs_error("%s\n", "Failed to initialize rilock");
181 goto out1;
182 }
183 }
184 return 0;
185 out1:
186 pthread_rwlock_destroy(&load_hash[i].rdlock);
187 out2:
188 pthread_mutex_destroy(&load_hash[i].lock);
189 out3:
190 while (i > 0) {
191 i--;
192 pthread_mutex_destroy(&load_hash[i].lock);
193 pthread_rwlock_destroy(&load_hash[i].rdlock);
194 pthread_rwlock_destroy(&load_hash[i].rilock);
195 }
196 return -1;
197 }
198
199 static void insert_node(struct load_node **n, int locate)
200 {
201 struct load_node *f;
202
203 pthread_mutex_lock(&load_hash[locate].lock);
204 pthread_rwlock_wrlock(&load_hash[locate].rilock);
205 f = load_hash[locate].next;
206 load_hash[locate].next = *n;
207
208 (*n)->pre = &(load_hash[locate].next);
209 if (f)
210 f->pre = &((*n)->next);
211 (*n)->next = f;
212 pthread_mutex_unlock(&load_hash[locate].lock);
213 pthread_rwlock_unlock(&load_hash[locate].rilock);
214 }
215 /*
216 * locate_node() finds special node. Not return NULL means success.
217 * It should be noted that rdlock isn't unlocked at the end of code
218 * because this function is used to read special node. Delete is not
219 * allowed before read has ended.
220 * unlock rdlock only in proc_loadavg_read().
221 */
222 static struct load_node *locate_node(char *cg, int locate)
223 {
224 struct load_node *f = NULL;
225 int i = 0;
226
227 pthread_rwlock_rdlock(&load_hash[locate].rilock);
228 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
229 if (load_hash[locate].next == NULL) {
230 pthread_rwlock_unlock(&load_hash[locate].rilock);
231 return f;
232 }
233 f = load_hash[locate].next;
234 pthread_rwlock_unlock(&load_hash[locate].rilock);
235 while (f && ((i = strcmp(f->cg, cg)) != 0))
236 f = f->next;
237 return f;
238 }
239 /* Delete the load_node n and return the next node of it. */
240 static struct load_node *del_node(struct load_node *n, int locate)
241 {
242 struct load_node *g;
243
244 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
245 if (n->next == NULL) {
246 *(n->pre) = NULL;
247 } else {
248 *(n->pre) = n->next;
249 n->next->pre = n->pre;
250 }
251 g = n->next;
252 free(n->cg);
253 free(n);
254 pthread_rwlock_unlock(&load_hash[locate].rdlock);
255 return g;
256 }
257
258 static void load_free(void)
259 {
260 int i;
261 struct load_node *f, *p;
262
263 for (i = 0; i < LOAD_SIZE; i++) {
264 pthread_mutex_lock(&load_hash[i].lock);
265 pthread_rwlock_wrlock(&load_hash[i].rilock);
266 pthread_rwlock_wrlock(&load_hash[i].rdlock);
267 if (load_hash[i].next == NULL) {
268 pthread_mutex_unlock(&load_hash[i].lock);
269 pthread_mutex_destroy(&load_hash[i].lock);
270 pthread_rwlock_unlock(&load_hash[i].rilock);
271 pthread_rwlock_destroy(&load_hash[i].rilock);
272 pthread_rwlock_unlock(&load_hash[i].rdlock);
273 pthread_rwlock_destroy(&load_hash[i].rdlock);
274 continue;
275 }
276 for (f = load_hash[i].next; f; ) {
277 free(f->cg);
278 p = f->next;
279 free(f);
280 f = p;
281 }
282 pthread_mutex_unlock(&load_hash[i].lock);
283 pthread_mutex_destroy(&load_hash[i].lock);
284 pthread_rwlock_unlock(&load_hash[i].rilock);
285 pthread_rwlock_destroy(&load_hash[i].rilock);
286 pthread_rwlock_unlock(&load_hash[i].rdlock);
287 pthread_rwlock_destroy(&load_hash[i].rdlock);
288 }
289 }
290
/* Data for CPU view: one node per cgroup. */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat */
	struct cpuacct_usage *view; /* Usage stats reported to the container */
	int cpu_count;
	pthread_mutex_t lock; /* For node manipulation */
	struct cg_proc_stat *next;
};

/* Head of one bucket of the CPU view hash table. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * Protects the list itself. Reading can be parallel, pruning is
	 * exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
313
314 static bool cpuview_init_head(struct cg_proc_stat_head **head)
315 {
316 *head = malloc(sizeof(struct cg_proc_stat_head));
317 if (!(*head)) {
318 lxcfs_error("%s\n", strerror(errno));
319 return false;
320 }
321
322 (*head)->lastcheck = time(NULL);
323 (*head)->next = NULL;
324
325 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
326 lxcfs_error("%s\n", "Failed to initialize list lock");
327 free(*head);
328 return false;
329 }
330
331 return true;
332 }
333
334 static bool init_cpuview()
335 {
336 int i;
337
338 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
339 proc_stat_history[i] = NULL;
340
341 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
342 if (!cpuview_init_head(&proc_stat_history[i]))
343 goto err;
344 }
345
346 return true;
347
348 err:
349 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
350 if (proc_stat_history[i]) {
351 free(proc_stat_history[i]);
352 proc_stat_history[i] = NULL;
353 }
354 }
355
356 return false;
357 }
358
359 static void free_proc_stat_node(struct cg_proc_stat *node)
360 {
361 pthread_mutex_destroy(&node->lock);
362 free(node->cg);
363 free(node->usage);
364 free(node->view);
365 free(node);
366 }
367
368 static void cpuview_free_head(struct cg_proc_stat_head *head)
369 {
370 struct cg_proc_stat *node, *tmp;
371
372 if (head->next) {
373 node = head->next;
374
375 for (;;) {
376 tmp = node;
377 node = node->next;
378 free_proc_stat_node(tmp);
379
380 if (!node)
381 break;
382 }
383 }
384
385 pthread_rwlock_destroy(&head->lock);
386 free(head);
387 }
388
389 static void free_cpuview()
390 {
391 int i;
392
393 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
394 if (proc_stat_history[i])
395 cpuview_free_head(proc_stat_history[i]);
396 }
397 }
398
/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * Looking up the init pid for $qpid works as follows:
 *   1. Stat /proc/$qpid/ns/pid.
 *   2. Check whether the ino_t is in our store.
 *      a. If it is not, fork a child in qpid's ns to send us
 *         ucred.pid = 1, and read the initpid. Cache the initpid and
 *         the creation time of /proc/initpid in a new store entry.
 *      b. If it is, verify that /proc/initpid still matches what we
 *         have saved. If it does not, clear the store entry and go
 *         back to a. If it does, return the cached initpid.
 */
struct pidns_init_store {
	ino_t ino; /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	long int ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;
	long int lastcheck;
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* The store itself, and the mutex guarding it. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l. A failure to lock is treated as fatal: the error is logged and
 * the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
439
/*
 * Number of hierarchies mounted.
 * READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 */
static int num_hierarchies;

/*
 * Hierarchies mounted {cpuset, blkio, ...}.
 * Initialized via __constructor__ collect_and_mount_subsystems() and
 * READ-ONLY once it has run.
 */
static char **hierarchies;

/*
 * Open file descriptors: @fd_hierarchies[i] refers to cgroup
 * @hierarchies[i]. The cgroups are mounted in a private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems() and
 * READ-ONLY once it has run.
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace, using the *at() family of functions
 * {openat(), fchownat(), ...}.
 */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;
460
/*
 * Unlock @l. A failure to unlock is treated as fatal: the error is logged
 * and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
470
471 static void store_lock(void)
472 {
473 lock_mutex(&pidns_store_mutex);
474 }
475
476 static void store_unlock(void)
477 {
478 unlock_mutex(&pidns_store_mutex);
479 }
480
481 /* Must be called under store_lock */
482 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
483 {
484 struct stat initsb;
485 char fnam[100];
486
487 snprintf(fnam, 100, "/proc/%d", e->initpid);
488 if (stat(fnam, &initsb) < 0)
489 return false;
490
491 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
492 initsb.st_ctime, e->initpid);
493
494 if (e->ctime != initsb.st_ctime)
495 return false;
496 return true;
497 }
498
499 /* Must be called under store_lock */
500 static void remove_initpid(struct pidns_init_store *e)
501 {
502 struct pidns_init_store *tmp;
503 int h;
504
505 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
506
507 h = HASH(e->ino);
508 if (pidns_hash_table[h] == e) {
509 pidns_hash_table[h] = e->next;
510 free(e);
511 return;
512 }
513
514 tmp = pidns_hash_table[h];
515 while (tmp) {
516 if (tmp->next == e) {
517 tmp->next = e->next;
518 free(e);
519 return;
520 }
521 tmp = tmp->next;
522 }
523 }
524
525 #define PURGE_SECS 5
526 /* Must be called under store_lock */
527 static void prune_initpid_store(void)
528 {
529 static long int last_prune = 0;
530 struct pidns_init_store *e, *prev, *delme;
531 long int now, threshold;
532 int i;
533
534 if (!last_prune) {
535 last_prune = time(NULL);
536 return;
537 }
538 now = time(NULL);
539 if (now < last_prune + PURGE_SECS)
540 return;
541
542 lxcfs_debug("%s\n", "Pruning.");
543
544 last_prune = now;
545 threshold = now - 2 * PURGE_SECS;
546
547 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
548 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
549 if (e->lastcheck < threshold) {
550
551 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
552
553 delme = e;
554 if (prev)
555 prev->next = e->next;
556 else
557 pidns_hash_table[i] = e->next;
558 e = e->next;
559 free(delme);
560 } else {
561 prev = e;
562 e = e->next;
563 }
564 }
565 }
566 }
567
568 /* Must be called under store_lock */
569 static void save_initpid(struct stat *sb, pid_t pid)
570 {
571 struct pidns_init_store *e;
572 char fpath[100];
573 struct stat procsb;
574 int h;
575
576 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
577
578 snprintf(fpath, 100, "/proc/%d", pid);
579 if (stat(fpath, &procsb) < 0)
580 return;
581 do {
582 e = malloc(sizeof(*e));
583 } while (!e);
584 e->ino = sb->st_ino;
585 e->initpid = pid;
586 e->ctime = procsb.st_ctime;
587 h = HASH(e->ino);
588 e->next = pidns_hash_table[h];
589 e->lastcheck = time(NULL);
590 pidns_hash_table[h] = e;
591 }
592
593 /*
594 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
595 * entry for the inode number and creation time. Verify that the init pid
596 * is still valid. If not, remove it. Return the entry if valid, NULL
597 * otherwise.
598 * Must be called under store_lock
599 */
600 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
601 {
602 int h = HASH(sb->st_ino);
603 struct pidns_init_store *e = pidns_hash_table[h];
604
605 while (e) {
606 if (e->ino == sb->st_ino) {
607 if (initpid_still_valid(e, sb)) {
608 e->lastcheck = time(NULL);
609 return e;
610 }
611 remove_initpid(e);
612 return NULL;
613 }
614 e = e->next;
615 }
616
617 return NULL;
618 }
619
/*
 * Report whether @path, resolved relative to directory fd @fd, is a
 * directory.
 *
 * Returns 1 if fstatat() succeeds and the result is a directory, 0 otherwise
 * (including when the path cannot be stat'ed).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/*
	 * The last argument of fstatat() is a flags bitmask, not a file
	 * descriptor; passing the fd there made the call fail with EINVAL
	 * for most descriptor values. No special flags are wanted here.
	 */
	int ret = fstatat(fd, path, &statbuf, 0);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
628
/*
 * Duplicate @str, retrying until the allocation succeeds.
 * Returns NULL only when @str itself is NULL. The caller frees the copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (!str)
		return NULL;

	while (!(copy = strdup(str)))
		;

	return copy;
}
640
/* Strip every '\n' at the end of @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		end--;
		s[end] = '\0';
	}
}
648
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes. Memory is handed out in multiples
 * of BATCH_SIZE; a reallocation only happens when *mem is NULL or when
 * @newlen needs more batches than @oldlen did. The reallocation is retried
 * until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int wanted = (newlen / BATCH_SIZE) + 1;
	int present = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && wanted <= present)
		return;

	do {
		grown = realloc(*mem, wanted * BATCH_SIZE);
	} while (!grown);

	*mem = grown;
}
/*
 * Append @line (@linelen bytes plus its terminating NUL) to *contents, whose
 * current length is *len, growing the buffer as needed. *len is updated to
 * the new length, not counting the NUL.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* linelen + 1 so the terminator is copied along with the text. */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
670
/*
 * Read everything available from @fd and return it as one heap-allocated
 * string with trailing newlines removed.
 *
 * @from: not used by this function.
 * @fd:   descriptor to read from. This function takes ownership of it: it is
 *        closed on every path, including when fdopen() fails.
 *
 * Returns NULL when the stream cannot be set up or when nothing was read.
 * The caller frees the returned buffer.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* fdopen() did not adopt the descriptor, so release it here. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	/* fclose() also closes the underlying descriptor. */
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
692
/*
 * Write @string to the already opened descriptor @fd.
 *
 * @fnam:   file name, used only in error messages.
 * @string: NUL-terminated text to write.
 * @fd:     descriptor to write to. This function takes ownership of it: it
 *          is closed on every path, including when fdopen() fails.
 *
 * Returns true when all bytes were written and the stream was closed without
 * error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() did not adopt the descriptor, so release it here. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes buffered data, so a failure here is a write failure. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
718
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};
724
725 #define ALLOC_NUM 20
726 static bool store_hierarchy(char *stridx, char *h)
727 {
728 if (num_hierarchies % ALLOC_NUM == 0) {
729 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
730 n *= ALLOC_NUM;
731 char **tmp = realloc(hierarchies, n * sizeof(char *));
732 if (!tmp) {
733 lxcfs_error("%s\n", strerror(errno));
734 exit(1);
735 }
736 hierarchies = tmp;
737 }
738
739 hierarchies[num_hierarchies++] = must_copy_string(h);
740 return true;
741 }
742
743 static void print_subsystems(void)
744 {
745 int i;
746
747 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
748 fprintf(stderr, "hierarchies:\n");
749 for (i = 0; i < num_hierarchies; i++) {
750 if (hierarchies[i])
751 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
752 fd_hierarchies[i], hierarchies[i]);
753 }
754 }
755
/*
 * Return true if @needle equals one of the comma-separated items of
 * @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t nlen = strlen(needle);
	const char *item = haystack;
	const char *comma;

	/* Every item that is followed by a comma. */
	while (*item && (comma = strchr(item, ',')) != NULL) {
		if (nlen == (size_t)(comma - item) &&
		    strncmp(needle, item, nlen) == 0)
			return true;
		item = comma + 1;
	}

	/* Whatever remains after the last comma (possibly the whole string). */
	return strcmp(needle, item) == 0;
}
774
775 /* do we need to do any massaging here? I'm not sure... */
776 /* Return the mounted controller and store the corresponding open file descriptor
777 * referring to the controller mountpoint in the private lxcfs namespace in
778 * @cfd.
779 */
780 static char *find_mounted_controller(const char *controller, int *cfd)
781 {
782 int i;
783
784 for (i = 0; i < num_hierarchies; i++) {
785 if (!hierarchies[i])
786 continue;
787 if (strcmp(hierarchies[i], controller) == 0) {
788 *cfd = fd_hierarchies[i];
789 return hierarchies[i];
790 }
791 if (in_comma_list(controller, hierarchies[i])) {
792 *cfd = fd_hierarchies[i];
793 return hierarchies[i];
794 }
795 }
796
797 return NULL;
798 }
799
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false when the controller is not mounted, the file cannot be
 * opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int cfd, fd, n;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
826
/*
 * Chown all the files in the cgroup directory. We do this when we create
 * a cgroup on behalf of a user.
 *
 * @dirname: directory, relative to @fd, whose entries are chowned.
 * @uid/@gid: new owner.
 * @fd: directory fd that @dirname (and the entry paths) are resolved against.
 *
 * Best effort: entries that cannot be chowned are logged and skipped.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not adopt fd1, so it has to be closed here. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	/* closedir() also closes fd1. */
	closedir(d);
}
866
/*
 * Create cgroup @cg under @controller and, unless both @uid and @gid are 0,
 * hand the new directory and the files in it to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno when creating or chowning the directory fails.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	/* Nothing to chown when the cgroup is created for root:root. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(path, uid, gid, cfd);

	return 0;
}
897
/*
 * Remove directory @dirname and, first, every directory below it.
 *
 * @dirname: path of the directory, relative to @cfd.
 * @fd:      open descriptor referring to @dirname; it is only duplicated
 *           here and stays owned by the caller.
 * @cfd:     directory fd that all paths are resolved against.
 *
 * Returns true when the directory stream was closed cleanly and @dirname
 * itself was removed; failures in subdirectories are only logged.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, subfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		/*
		 * Descend with a descriptor for the subdirectory itself.
		 * Reusing the parent's fd would make the nested call iterate
		 * the parent directory again instead of the child.
		 */
		subfd = openat(cfd, pathname, O_RDONLY | O_DIRECTORY | O_NOFOLLOW);
		if (subfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!recursive_rmdir(pathname, subfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(subfd);
	}

	ret = true;
	/*
	 * closedir() also closes dupfd; closing it again afterwards would be a
	 * double close.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
956
/*
 * Remove cgroup @cg of @controller together with the directories below it.
 * Returns false when the controller is not mounted, the cgroup directory
 * cannot be opened, or the removal fails.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd_cg;
	size_t pathlen;
	char *path;
	bool removed;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd_cg = openat(cfd, path, O_DIRECTORY);
	if (dirfd_cg < 0)
		return false;

	removed = recursive_rmdir(path, dirfd_cg, cfd);
	close(dirfd_cg);
	return removed;
}
983
/*
 * Change the mode of @file under @controller to @mode.
 * Returns false when the controller is not mounted or fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) >= 0;
}
1004
/*
 * Chown the "tasks" and "cgroup.procs" files inside @dirname (relative to
 * @fd) to @uid:@gid, in that order.
 * Returns 0 on success or -errno of the first chown that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names. */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *target = alloca(buflen);

	snprintf(target, buflen, "%s/tasks", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	snprintf(target, buflen, "%s/cgroup.procs", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
1020
/*
 * Chown @file under @controller to @uid:@gid. When @file is a directory its
 * tasks files are chowned as well.
 *
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno when a chown fails.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(path, cfd))
		return 0;

	/* like cgmanager did, we want to chown the tasks file as well */
	return chown_tasks_files(path, uid, gid, cfd);
}
1046
/*
 * Open the cgroup.procs file of @cgroup under @controller for writing.
 *
 * Returns a stream the caller must fclose(), or NULL when the controller is
 * not mounted, the file cannot be opened, or the stream cannot be created.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	/* If fdopen() fails the descriptor is still ours and would leak. */
	if (!f)
		close(fd);

	return f;
}
1070
1071 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1072 void ***list, size_t typesize,
1073 void* (*iterator)(const char*, const char*, const char*))
1074 {
1075 int cfd, fd, ret;
1076 size_t len;
1077 char *cg, *tmpc;
1078 char pathname[MAXPATHLEN];
1079 size_t sz = 0, asz = 0;
1080 struct dirent *dirent;
1081 DIR *dir;
1082
1083 tmpc = find_mounted_controller(controller, &cfd);
1084 *list = NULL;
1085 if (!tmpc)
1086 return false;
1087
1088 /* Make sure we pass a relative path to *at() family of functions. */
1089 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1090 cg = alloca(len);
1091 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1092 if (ret < 0 || (size_t)ret >= len) {
1093 lxcfs_error("Pathname too long under %s\n", cgroup);
1094 return false;
1095 }
1096
1097 fd = openat(cfd, cg, O_DIRECTORY);
1098 if (fd < 0)
1099 return false;
1100
1101 dir = fdopendir(fd);
1102 if (!dir)
1103 return false;
1104
1105 while ((dirent = readdir(dir))) {
1106 struct stat mystat;
1107
1108 if (!strcmp(dirent->d_name, ".") ||
1109 !strcmp(dirent->d_name, ".."))
1110 continue;
1111
1112 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1113 if (ret < 0 || ret >= MAXPATHLEN) {
1114 lxcfs_error("Pathname too long under %s\n", cg);
1115 continue;
1116 }
1117
1118 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1119 if (ret) {
1120 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1121 continue;
1122 }
1123 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1124 (directories && !S_ISDIR(mystat.st_mode)))
1125 continue;
1126
1127 if (sz+2 >= asz) {
1128 void **tmp;
1129 asz += BATCH_SIZE;
1130 do {
1131 tmp = realloc(*list, asz * typesize);
1132 } while (!tmp);
1133 *list = tmp;
1134 }
1135 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1136 (*list)[sz+1] = NULL;
1137 sz++;
1138 }
1139 if (closedir(dir) < 0) {
1140 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1141 return false;
1142 }
1143 return true;
1144 }
1145
/*
 * List-entry callback for child cgroups: returns a heap copy of @dir_entry,
 * retrying until the allocation succeeds. @controller and @cgroup are not
 * used. The caller frees the copy.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while (!(copy = strdup(dir_entry)))
		;

	return copy;
}
1154
1155 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1156 {
1157 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1158 }
1159
1160 void free_key(struct cgfs_files *k)
1161 {
1162 if (!k)
1163 return;
1164 free(k->name);
1165 free(k);
1166 }
1167
/*
 * Free a NULL-terminated array of keys: every key, then the array itself.
 * A NULL array is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (!keys)
		return;

	for (cursor = keys; *cursor; cursor++)
		free_key(*cursor);

	free(keys);
}
1179
/*
 * Read @file inside @cgroup of @controller into a newly allocated string
 * stored in *value.
 * Returns false when the controller is not mounted, the file cannot be
 * opened, or slurp_file() yields NULL.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, n;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);
	return *value != NULL;
}
1206
/*
 * Report whether @file exists inside @cgroup of @controller.
 * Returns false as well when the controller is not mounted.
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int cfd, n;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() functions need a path relative to cfd:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	return (faccessat(cfd, path, F_OK, 0) == 0);
}
1228
1229 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1230 {
1231 int ret, cfd;
1232 size_t len;
1233 char *fnam, *tmpc;
1234 struct stat sb;
1235 struct cgfs_files *newkey;
1236
1237 tmpc = find_mounted_controller(controller, &cfd);
1238 if (!tmpc)
1239 return false;
1240
1241 if (file && *file == '/')
1242 file++;
1243
1244 if (file && strchr(file, '/'))
1245 return NULL;
1246
1247 /* Make sure we pass a relative path to *at() family of functions.
1248 * . + /cgroup + / + file + \0
1249 */
1250 len = strlen(cgroup) + 3;
1251 if (file)
1252 len += strlen(file) + 1;
1253 fnam = alloca(len);
1254 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1255 file ? "/" : "", file ? file : "");
1256
1257 ret = fstatat(cfd, fnam, &sb, 0);
1258 if (ret < 0)
1259 return NULL;
1260
1261 do {
1262 newkey = malloc(sizeof(struct cgfs_files));
1263 } while (!newkey);
1264 if (file)
1265 newkey->name = must_copy_string(file);
1266 else if (strrchr(cgroup, '/'))
1267 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1268 else
1269 newkey->name = must_copy_string(cgroup);
1270 newkey->uid = sb.st_uid;
1271 newkey->gid = sb.st_gid;
1272 newkey->mode = sb.st_mode;
1273
1274 return newkey;
1275 }
1276
/*
 * Entry constructor used when listing keys: wraps cgfs_get_key() for
 * @dir_entry and logs an error when no key could be built. The (possibly
 * NULL) key is returned either way.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		    cgroup);
	return key;
}
1286
1287 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1288 {
1289 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1290 }
1291
/*
 * Report whether @f names a directory directly inside @cgroup of @controller.
 *
 * Returns true only if fstatat() on the relative path succeeds and the result
 * is a directory; false otherwise, including when the controller is not
 * mounted or the path cannot be built.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t bufsz;
	char *relpath;
	struct stat st;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/*
	 * The *at() calls need a path relative to cfd, so an absolute cgroup
	 * gets a leading ".":  . + /cgroup + / + f + \0
	 */
	bufsz = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(bufsz);
	n = snprintf(relpath, bufsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= bufsz)
		return false;

	if (fstatat(cfd, relpath, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode) ? true : false;
}
1319
/* Result codes returned by send_creds(). */
#define SEND_CREDS_OK 0 /* message was sent */
#define SEND_CREDS_NOTSK 1 /* sendmsg() failed with errno 3 (ESRCH on Linux) */
#define SEND_CREDS_FAIL 2 /* any other failure */
/* Forward declarations of helpers that are defined further down in this file. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1327
1328 /*
1329 * clone a task which switches to @task's namespace and writes '1'.
1330 * over a unix sock so we can read the task's reaper's pid in our
1331 * namespace
1332 *
1333 * Note: glibc's fork() does not respect pidns, which can lead to failed
1334 * assertions inside glibc (and thus failed forks) if the child's pid in
1335 * the pidns and the parent pid outside are identical. Using clone prevents
1336 * this issue.
1337 */
1338 static void write_task_init_pid_exit(int sock, pid_t target)
1339 {
1340 char fnam[100];
1341 pid_t pid;
1342 int fd, ret;
1343 size_t stack_size = sysconf(_SC_PAGESIZE);
1344 void *stack = alloca(stack_size);
1345
1346 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1347 if (ret < 0 || ret >= sizeof(fnam))
1348 _exit(1);
1349
1350 fd = open(fnam, O_RDONLY);
1351 if (fd < 0) {
1352 perror("write_task_init_pid_exit open of ns/pid");
1353 _exit(1);
1354 }
1355 if (setns(fd, 0)) {
1356 perror("write_task_init_pid_exit setns 1");
1357 close(fd);
1358 _exit(1);
1359 }
1360 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1361 if (pid < 0)
1362 _exit(1);
1363 if (pid != 0) {
1364 if (!wait_for_pid(pid))
1365 _exit(1);
1366 _exit(0);
1367 }
1368 }
1369
1370 static int send_creds_clone_wrapper(void *arg) {
1371 struct ucred cred;
1372 char v;
1373 int sock = *(int *)arg;
1374
1375 /* we are the child */
1376 cred.uid = 0;
1377 cred.gid = 0;
1378 cred.pid = 1;
1379 v = '1';
1380 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1381 return 1;
1382 return 0;
1383 }
1384
1385 static pid_t get_init_pid_for_task(pid_t task)
1386 {
1387 int sock[2];
1388 pid_t pid;
1389 pid_t ret = -1;
1390 char v = '0';
1391 struct ucred cred;
1392
1393 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1394 perror("socketpair");
1395 return -1;
1396 }
1397
1398 pid = fork();
1399 if (pid < 0)
1400 goto out;
1401 if (!pid) {
1402 close(sock[1]);
1403 write_task_init_pid_exit(sock[0], task);
1404 _exit(0);
1405 }
1406
1407 if (!recv_creds(sock[1], &cred, &v))
1408 goto out;
1409 ret = cred.pid;
1410
1411 out:
1412 close(sock[0]);
1413 close(sock[1]);
1414 if (pid > 0)
1415 wait_for_pid(pid);
1416 return ret;
1417 }
1418
1419 static pid_t lookup_initpid_in_store(pid_t qpid)
1420 {
1421 pid_t answer = 0;
1422 struct stat sb;
1423 struct pidns_init_store *e;
1424 char fnam[100];
1425
1426 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1427 store_lock();
1428 if (stat(fnam, &sb) < 0)
1429 goto out;
1430 e = lookup_verify_initpid(&sb);
1431 if (e) {
1432 answer = e->initpid;
1433 goto out;
1434 }
1435 answer = get_init_pid_for_task(qpid);
1436 if (answer > 0)
1437 save_initpid(&sb, answer);
1438
1439 out:
1440 /* we prune at end in case we are returning
1441 * the value we were about to return */
1442 prune_initpid_store();
1443 store_unlock();
1444 return answer;
1445 }
1446
/*
 * Wait for child @pid to terminate, retrying when waitpid() is interrupted.
 *
 * Returns 0 if the child exited normally with status 0; -1 if @pid is not
 * positive, waitpid() fails for a reason other than EINTR, or the child did
 * not exit cleanly.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1467
1468
1469 /*
1470 * append pid to *src.
1471 * src: a pointer to a char* in which ot append the pid.
1472 * sz: the number of characters printed so far, minus trailing \0.
1473 * asz: the allocated size so far
1474 * pid: the pid to append
1475 */
1476 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1477 {
1478 char tmp[30];
1479
1480 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1481
1482 if (!*src || tmplen + *sz + 1 >= *asz) {
1483 char *tmp;
1484 do {
1485 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1486 } while (!tmp);
1487 *src = tmp;
1488 *asz += BUF_RESERVE_SIZE;
1489 }
1490 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1491 *sz += tmplen;
1492 }
1493
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not hold three
 * unsigned numbers are skipped.
 * Returns the mapped id, or -1 (i.e. UINT_MAX, as the return type is
 * unsigned) if no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   // base id for a range in the idfile's namespace
		     host_base, // base id for a range in the caller's namespace
		     range_len; // number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range_len) != 3)
			continue;

		if (host_base + range_len < host_base || ns_base + range_len < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    ns_base, host_base, range_len, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range_len)
			continue;

		/*
		 * host_base <= in_id < host_base + range_len, and neither
		 * range wraps, so ns_base + (in_id - host_base) is below
		 * ns_base + range_len and cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1537
/*
 * Values for the req_ns_root argument of is_privileged_over():
 * specify whether we require the calling uid to be root in its own
 * namespace (NS_ROOT_REQD) or whether having the same uid as the victim
 * is already sufficient (NS_ROOT_OPT).
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the stack buffers that hold /proc/<pid>/... paths. */
#define PROCLEN 100
1547
1548 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1549 {
1550 char fpath[PROCLEN];
1551 int ret;
1552 bool answer = false;
1553 uid_t nsuid;
1554
1555 if (victim == -1 || uid == -1)
1556 return false;
1557
1558 /*
1559 * If the request is one not requiring root in the namespace,
1560 * then having the same uid suffices. (i.e. uid 1000 has write
1561 * access to files owned by uid 1000
1562 */
1563 if (!req_ns_root && uid == victim)
1564 return true;
1565
1566 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1567 if (ret < 0 || ret >= PROCLEN)
1568 return false;
1569 FILE *f = fopen(fpath, "r");
1570 if (!f)
1571 return false;
1572
1573 /* if caller's not root in his namespace, reject */
1574 nsuid = convert_id_to_ns(f, uid);
1575 if (nsuid)
1576 goto out;
1577
1578 /*
1579 * If victim is not mapped into caller's ns, reject.
1580 * XXX I'm not sure this check is needed given that fuse
1581 * will be sending requests where the vfs has converted
1582 */
1583 nsuid = convert_id_to_ns(f, victim);
1584 if (nsuid == -1)
1585 goto out;
1586
1587 answer = true;
1588
1589 out:
1590 fclose(f);
1591 return answer;
1592 }
1593
/*
 * Check whether the "other" permission bits of @fmode grant the access asked
 * for by the O_ACCMODE part of @req_mode.
 *
 * O_RDONLY needs S_IROTH, O_WRONLY needs S_IWOTH, O_RDWR needs both. Callers
 * shift owner or group bits down into the "other" position before calling.
 * Returns false for any other access mode value.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted = req_mode & O_ACCMODE;
	mode_t needed;

	if (wanted == O_RDONLY)
		needed = S_IROTH;
	else if (wanted == O_WRONLY)
		needed = S_IWOTH;
	else if (wanted == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1613
1614
/*
 * Return the first path component of @taskcg that follows @querycg.
 *
 * Example: taskcg is /a/b/c/d/e and querycg is /a/b/c: we return "d".
 * When @querycg is "/" or "./", only the first character of @taskcg is
 * skipped. @taskcg must be strictly longer than @querycg; whether @querycg is
 * really a prefix of @taskcg is not verified here.
 *
 * Returns a newly allocated string the caller must free, or NULL on bad
 * input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* keep only the first remaining component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1640
/* Remove one trailing '\n' from @x in place, if present. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1647
1648 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1649 {
1650 int cfd;
1651 char fnam[PROCLEN];
1652 FILE *f;
1653 char *answer = NULL;
1654 char *line = NULL;
1655 size_t len = 0;
1656 int ret;
1657 const char *h = find_mounted_controller(contrl, &cfd);
1658 if (!h)
1659 return NULL;
1660
1661 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1662 if (ret < 0 || ret >= PROCLEN)
1663 return NULL;
1664 if (!(f = fopen(fnam, "r")))
1665 return NULL;
1666
1667 while (getline(&line, &len, f) != -1) {
1668 char *c1, *c2;
1669 if (!line[0])
1670 continue;
1671 c1 = strchr(line, ':');
1672 if (!c1)
1673 goto out;
1674 c1++;
1675 c2 = strchr(c1, ':');
1676 if (!c2)
1677 goto out;
1678 *c2 = '\0';
1679 if (strcmp(c1, h) != 0)
1680 continue;
1681 c2++;
1682 stripnewline(c2);
1683 do {
1684 answer = strdup(c2);
1685 } while (!answer);
1686 break;
1687 }
1688
1689 out:
1690 fclose(f);
1691 free(line);
1692 return answer;
1693 }
1694
1695 /*
1696 * check whether a fuse context may access a cgroup dir or file
1697 *
1698 * If file is not null, it is a cgroup file to check under cg.
1699 * If file is null, then we are checking perms on cg itself.
1700 *
1701 * For files we can check the mode of the list_keys result.
1702 * For cgroups, we must make assumptions based on the files under the
1703 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1704 * yet.
1705 */
1706 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1707 {
1708 struct cgfs_files *k = NULL;
1709 bool ret = false;
1710
1711 k = cgfs_get_key(contrl, cg, file);
1712 if (!k)
1713 return false;
1714
1715 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1716 if (perms_include(k->mode >> 6, mode)) {
1717 ret = true;
1718 goto out;
1719 }
1720 }
1721 if (fc->gid == k->gid) {
1722 if (perms_include(k->mode >> 3, mode)) {
1723 ret = true;
1724 goto out;
1725 }
1726 }
1727 ret = perms_include(k->mode, mode);
1728
1729 out:
1730 free_key(k);
1731 return ret;
1732 }
1733
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place.
 * If the whole string is "/init.scope", it becomes "/" rather than empty.
 * Strings that do not end in INITSCOPE are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0'; /* keep the leading '/' */
	else
		tail[0] = '\0';
}
1751
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, then *nextcg is set to the
 * result of get_next_cgroup_dir() (the next cgroup directory under cg, or
 * NULL), which must be freed by the caller.
 *
 * Returns false as well when the task's cgroup cannot be determined; *nextcg
 * is left untouched in that case.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer;
	const char *cmp;
	char *taskcg = get_pid_cgroup(pid, contrl);

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup, otherwise
	 * they pass in a cgroup without a leading '/', so the leading '/' of
	 * the task's cgroup is skipped for the comparison in that case.
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = taskcg;
	else
		cmp = taskcg + 1;

	/*
	 * NOTE(review): this is a plain byte-prefix test, so it does not check
	 * that the match ends on a path-component boundary.
	 */
	answer = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return answer;
}
1794
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is visible when it is the root ("/" or "./"), when the task lives in
 * the root cgroup, when @cg equals the task's cgroup, or when one of the two
 * is a path-component-wise ancestor of the other. Returns false if the task's
 * cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible;
	char *full, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	full = get_pid_cgroup(pid, contrl);
	if (!full)
		return false;
	prune_init_slice(full);

	/* compare without the leading '/' of the task's cgroup */
	task_cg = full + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything. The prefix
		 * tests below look for a '/' separator, but the only '/' was
		 * the leading one that has just been skipped.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		/* same length but different names */
		visible = false;
	}

	free(full);
	return visible;
}
1845
1846 /*
1847 * given /cgroup/freezer/a/b, return "freezer".
1848 * the returned char* should NOT be freed.
1849 */
1850 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1851 {
1852 const char *p1;
1853 char *contr, *slash;
1854
1855 if (strlen(path) < 9) {
1856 errno = EACCES;
1857 return NULL;
1858 }
1859 if (*(path + 7) != '/') {
1860 errno = EINVAL;
1861 return NULL;
1862 }
1863 p1 = path + 8;
1864 contr = strdupa(p1);
1865 if (!contr) {
1866 errno = ENOMEM;
1867 return NULL;
1868 }
1869 slash = strstr(contr, "/");
1870 if (slash)
1871 *slash = '\0';
1872
1873 int i;
1874 for (i = 0; i < num_hierarchies; i++) {
1875 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1876 return hierarchies[i];
1877 }
1878 errno = ENOENT;
1879 return NULL;
1880 }
1881
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into @path just past the '/' that follows the controller
 * name and sets errno to 0. Returns NULL with errno EACCES if @path is
 * shorter than 9 characters, or with errno EINVAL if there is no '/' after
 * the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	sep = strchr(path + 8, '/');
	if (sep) {
		errno = 0;
		return sep + 1;
	}

	errno = EINVAL;
	return NULL;
}
1902
/*
 * Split the last path element from the path in @cg.
 *
 * *@dir is a newly allocated copy of @cg cut at its last '/', and should be
 * freed by the caller. *@last points INTO @cg at that last '/' (so it still
 * starts with the slash) and must not be freed. If @cg contains no '/',
 * *@dir is a full copy of @cg and *@last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;

	/* allocation is retried until it succeeds */
	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last)
		*strrchr(copy, '/') = '\0';
}
1922
1923 /*
1924 * FUSE ops for /cgroup
1925 */
1926
/*
 * FUSE getattr handler for paths under /cgroup.
 *
 * Fills @sb for "/cgroup" itself, for a controller directory, for a cgroup
 * directory, or for a file inside a cgroup. All timestamps are set to the
 * current time and the size is always 0.
 *
 * Returns 0 on success or a negative errno value: -EIO without a fuse
 * context, -EINVAL if the clock cannot be read, the errno set by
 * pick_controller_from_path() for a bad controller, -ENOENT when the entry
 * does not exist or must stay hidden from the caller, -EACCES when the caller
 * may not read the cgroup directory.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* cgdir is heap-allocated from here on; every later exit goes via out */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1 is the containing cgroup, path2 the last path element */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* fall back to the caller's own pid if no init pid could be found */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that path2 is either a child cgroup of path1, or a file in it.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* the caller's own cgroup is not at or above this one:
			 * report it as a directory without write permission */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// uid and gid come from stat()ing the cgroup directory itself
		// (cgfs_get_key() with a NULL file); the mode is made up.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* not a directory: try it as a file inside path1 */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		/* files are only visible from within the containing cgroup */
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2036
2037 int cg_opendir(const char *path, struct fuse_file_info *fi)
2038 {
2039 struct fuse_context *fc = fuse_get_context();
2040 const char *cgroup;
2041 struct file_info *dir_info;
2042 char *controller = NULL;
2043
2044 if (!fc)
2045 return -EIO;
2046
2047 if (strcmp(path, "/cgroup") == 0) {
2048 cgroup = NULL;
2049 controller = NULL;
2050 } else {
2051 // return list of keys for the controller, and list of child cgroups
2052 controller = pick_controller_from_path(fc, path);
2053 if (!controller)
2054 return -errno;
2055
2056 cgroup = find_cgroup_in_path(path);
2057 if (!cgroup) {
2058 /* this is just /cgroup/controller, return its contents */
2059 cgroup = "/";
2060 }
2061 }
2062
2063 pid_t initpid = lookup_initpid_in_store(fc->pid);
2064 if (initpid <= 0)
2065 initpid = fc->pid;
2066 if (cgroup) {
2067 if (!caller_may_see_dir(initpid, controller, cgroup))
2068 return -ENOENT;
2069 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2070 return -EACCES;
2071 }
2072
2073 /* we'll free this at cg_releasedir */
2074 dir_info = malloc(sizeof(*dir_info));
2075 if (!dir_info)
2076 return -ENOMEM;
2077 dir_info->controller = must_copy_string(controller);
2078 dir_info->cgroup = must_copy_string(cgroup);
2079 dir_info->type = LXC_TYPE_CGDIR;
2080 dir_info->buf = NULL;
2081 dir_info->file = NULL;
2082 dir_info->buflen = 0;
2083
2084 fi->fh = (unsigned long)dir_info;
2085 return 0;
2086 }
2087
/*
 * FUSE readdir handler for directories opened by cg_opendir().
 *
 * Always emits "." and "..". For the top-level /cgroup directory the names of
 * all hierarchies are listed. For a cgroup the caller is not inside of, only
 * the next directory towards the caller's own cgroup is listed (if there is
 * one). Otherwise all keys (files) are listed, followed by all child cgroups.
 *
 * Returns 0 on success, -EIO if the filler fails or the handle is not a
 * directory handle, -EINVAL if the cgroup's keys cannot be listed.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	/* fall back to the caller's own pid if no init pid could be found */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* caller is not inside this cgroup: show at most the one
		 * directory returned through nextcg */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* first the files of this cgroup */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	/* a failure to list children is not treated as an error */
	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	/* both lists are NULL-terminated arrays owned by this function */
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2171
2172 static void do_release_file_info(struct fuse_file_info *fi)
2173 {
2174 struct file_info *f = (struct file_info *)fi->fh;
2175
2176 if (!f)
2177 return;
2178
2179 fi->fh = 0;
2180
2181 free(f->controller);
2182 f->controller = NULL;
2183 free(f->cgroup);
2184 f->cgroup = NULL;
2185 free(f->file);
2186 f->file = NULL;
2187 free(f->buf);
2188 f->buf = NULL;
2189 free(f);
2190 f = NULL;
2191 }
2192
/*
 * FUSE releasedir handler: frees the file_info attached by cg_opendir().
 * @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2198
2199 int cg_open(const char *path, struct fuse_file_info *fi)
2200 {
2201 const char *cgroup;
2202 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2203 struct cgfs_files *k = NULL;
2204 struct file_info *file_info;
2205 struct fuse_context *fc = fuse_get_context();
2206 int ret;
2207
2208 if (!fc)
2209 return -EIO;
2210
2211 controller = pick_controller_from_path(fc, path);
2212 if (!controller)
2213 return -errno;
2214 cgroup = find_cgroup_in_path(path);
2215 if (!cgroup)
2216 return -errno;
2217
2218 get_cgdir_and_path(cgroup, &cgdir, &last);
2219 if (!last) {
2220 path1 = "/";
2221 path2 = cgdir;
2222 } else {
2223 path1 = cgdir;
2224 path2 = last;
2225 }
2226
2227 k = cgfs_get_key(controller, path1, path2);
2228 if (!k) {
2229 ret = -EINVAL;
2230 goto out;
2231 }
2232 free_key(k);
2233
2234 pid_t initpid = lookup_initpid_in_store(fc->pid);
2235 if (initpid <= 0)
2236 initpid = fc->pid;
2237 if (!caller_may_see_dir(initpid, controller, path1)) {
2238 ret = -ENOENT;
2239 goto out;
2240 }
2241 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2242 ret = -EACCES;
2243 goto out;
2244 }
2245
2246 /* we'll free this at cg_release */
2247 file_info = malloc(sizeof(*file_info));
2248 if (!file_info) {
2249 ret = -ENOMEM;
2250 goto out;
2251 }
2252 file_info->controller = must_copy_string(controller);
2253 file_info->cgroup = must_copy_string(path1);
2254 file_info->file = must_copy_string(path2);
2255 file_info->type = LXC_TYPE_CGFILE;
2256 file_info->buf = NULL;
2257 file_info->buflen = 0;
2258
2259 fi->fh = (unsigned long)file_info;
2260 ret = 0;
2261
2262 out:
2263 free(cgdir);
2264 return ret;
2265 }
2266
2267 int cg_access(const char *path, int mode)
2268 {
2269 int ret;
2270 const char *cgroup;
2271 char *path1, *path2, *controller;
2272 char *last = NULL, *cgdir = NULL;
2273 struct cgfs_files *k = NULL;
2274 struct fuse_context *fc = fuse_get_context();
2275
2276 if (strcmp(path, "/cgroup") == 0)
2277 return 0;
2278
2279 if (!fc)
2280 return -EIO;
2281
2282 controller = pick_controller_from_path(fc, path);
2283 if (!controller)
2284 return -errno;
2285 cgroup = find_cgroup_in_path(path);
2286 if (!cgroup) {
2287 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2288 if ((mode & W_OK) == 0)
2289 return 0;
2290 return -EACCES;
2291 }
2292
2293 get_cgdir_and_path(cgroup, &cgdir, &last);
2294 if (!last) {
2295 path1 = "/";
2296 path2 = cgdir;
2297 } else {
2298 path1 = cgdir;
2299 path2 = last;
2300 }
2301
2302 k = cgfs_get_key(controller, path1, path2);
2303 if (!k) {
2304 if ((mode & W_OK) == 0)
2305 ret = 0;
2306 else
2307 ret = -EACCES;
2308 goto out;
2309 }
2310 free_key(k);
2311
2312 pid_t initpid = lookup_initpid_in_store(fc->pid);
2313 if (initpid <= 0)
2314 initpid = fc->pid;
2315 if (!caller_may_see_dir(initpid, controller, path1)) {
2316 ret = -ENOENT;
2317 goto out;
2318 }
2319 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2320 ret = -EACCES;
2321 goto out;
2322 }
2323
2324 ret = 0;
2325
2326 out:
2327 free(cgdir);
2328 return ret;
2329 }
2330
/*
 * FUSE release handler: frees the file_info attached by cg_open().
 * @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2336
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable or is hung up, for at most about
 * @timeout seconds (measured with time(), so with one-second granularity).
 *
 * Returns true if epoll reported an event on the socket. Returns false on
 * error or timeout; when the deadline has already passed before waiting,
 * errno is set to 0, and after epoll_wait() its errno is preserved across
 * the close() of the epoll fd.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, saved_errno;
	/* time_t, so that the value returned by time() is not truncated */
	time_t now, starttime, deltatime;

	if ((starttime = time(NULL)) == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* "%m" is only expanded inside a format string, so pass the
		 * error text explicitly instead */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) == (time_t)-1) {
		close(epfd);
		return false;
	}

	/* recompute the remaining time after every interrupted wait */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, (int)(1000 * deltatime + 1));
	if (ret < 0 && errno == EINTR)
		goto again;
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2384
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about two
 * seconds for data to arrive.
 * Returns the result of recv(), or -1 if wait_for_sock() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool readable = wait_for_sock(sockfd, 2);

	if (readable)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
2391
2392 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2393 {
2394 struct msghdr msg = { 0 };
2395 struct iovec iov;
2396 struct cmsghdr *cmsg;
2397 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2398 char buf[1];
2399 buf[0] = 'p';
2400
2401 if (pingfirst) {
2402 if (msgrecv(sock, buf, 1) != 1) {
2403 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2404 return SEND_CREDS_FAIL;
2405 }
2406 }
2407
2408 msg.msg_control = cmsgbuf;
2409 msg.msg_controllen = sizeof(cmsgbuf);
2410
2411 cmsg = CMSG_FIRSTHDR(&msg);
2412 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2413 cmsg->cmsg_level = SOL_SOCKET;
2414 cmsg->cmsg_type = SCM_CREDENTIALS;
2415 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2416
2417 msg.msg_name = NULL;
2418 msg.msg_namelen = 0;
2419
2420 buf[0] = v;
2421 iov.iov_base = buf;
2422 iov.iov_len = sizeof(buf);
2423 msg.msg_iov = &iov;
2424 msg.msg_iovlen = 1;
2425
2426 if (sendmsg(sock, &msg, 0) < 0) {
2427 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2428 if (errno == 3)
2429 return SEND_CREDS_NOTSK;
2430 return SEND_CREDS_FAIL;
2431 }
2432
2433 return SEND_CREDS_OK;
2434 }
2435
2436 static bool recv_creds(int sock, struct ucred *cred, char *v)
2437 {
2438 struct msghdr msg = { 0 };
2439 struct iovec iov;
2440 struct cmsghdr *cmsg;
2441 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2442 char buf[1];
2443 int ret;
2444 int optval = 1;
2445
2446 *v = '1';
2447
2448 cred->pid = -1;
2449 cred->uid = -1;
2450 cred->gid = -1;
2451
2452 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2453 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2454 return false;
2455 }
2456 buf[0] = '1';
2457 if (write(sock, buf, 1) != 1) {
2458 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2459 return false;
2460 }
2461
2462 msg.msg_name = NULL;
2463 msg.msg_namelen = 0;
2464 msg.msg_control = cmsgbuf;
2465 msg.msg_controllen = sizeof(cmsgbuf);
2466
2467 iov.iov_base = buf;
2468 iov.iov_len = sizeof(buf);
2469 msg.msg_iov = &iov;
2470 msg.msg_iovlen = 1;
2471
2472 if (!wait_for_sock(sock, 2)) {
2473 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2474 return false;
2475 }
2476 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2477 if (ret < 0) {
2478 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2479 return false;
2480 }
2481
2482 cmsg = CMSG_FIRSTHDR(&msg);
2483
2484 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2485 cmsg->cmsg_level == SOL_SOCKET &&
2486 cmsg->cmsg_type == SCM_CREDENTIALS) {
2487 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2488 }
2489 *v = buf[0];
2490
2491 return true;
2492 }
2493
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe; /* pipe fd pair; the child closes [0] and writes its ACK to [1] */
	int sock; /* passed as the first argument to wrapped() */
	pid_t tpid; /* passed as the second argument to wrapped() */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2500
2501 /*
2502 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2503 * with clone(). This simply writes '1' as ACK back to the parent
2504 * before calling the actual wrapped function.
2505 */
2506 static int pid_ns_clone_wrapper(void *arg) {
2507 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2508 char b = '1';
2509
2510 close(args->cpipe[0]);
2511 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2512 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2513 close(args->cpipe[1]);
2514 return args->wrapped(args->sock, args->tpid);
2515 }
2516
2517 /*
2518 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2519 * int value back over the socket. This shifts the pid from the
2520 * sender's pidns into tpid's pidns.
2521 */
2522 static int pid_to_ns(int sock, pid_t tpid)
2523 {
2524 char v = '0';
2525 struct ucred cred;
2526
2527 while (recv_creds(sock, &cred, &v)) {
2528 if (v == '1')
2529 return 0;
2530 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2531 return 1;
2532 }
2533 return 0;
2534 }
2535
2536
2537 /*
2538 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2539 * in your old pidns. Only children which you clone will be in the target
2540 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2541 * actually convert pids.
2542 *
2543 * Note: glibc's fork() does not respect pidns, which can lead to failed
2544 * assertions inside glibc (and thus failed forks) if the child's pid in
2545 * the pidns and the parent pid outside are identical. Using clone prevents
2546 * this issue.
2547 */
2548 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2549 {
2550 int newnsfd = -1, ret, cpipe[2];
2551 char fnam[100];
2552 pid_t cpid;
2553 char v;
2554
2555 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2556 if (ret < 0 || ret >= sizeof(fnam))
2557 _exit(1);
2558 newnsfd = open(fnam, O_RDONLY);
2559 if (newnsfd < 0)
2560 _exit(1);
2561 if (setns(newnsfd, 0) < 0)
2562 _exit(1);
2563 close(newnsfd);
2564
2565 if (pipe(cpipe) < 0)
2566 _exit(1);
2567
2568 struct pid_ns_clone_args args = {
2569 .cpipe = cpipe,
2570 .sock = sock,
2571 .tpid = tpid,
2572 .wrapped = &pid_to_ns
2573 };
2574 size_t stack_size = sysconf(_SC_PAGESIZE);
2575 void *stack = alloca(stack_size);
2576
2577 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2578 if (cpid < 0)
2579 _exit(1);
2580
2581 // give the child 1 second to be done forking and
2582 // write its ack
2583 if (!wait_for_sock(cpipe[0], 1))
2584 _exit(1);
2585 ret = read(cpipe[0], &v, 1);
2586 if (ret != sizeof(char) || v != '1')
2587 _exit(1);
2588
2589 if (!wait_for_pid(cpid))
2590 _exit(1);
2591 _exit(0);
2592 }
2593
2594 /*
2595 * To read cgroup files with a particular pid, we will setns into the child
2596 * pidns, open a pipe, fork a child - which will be the first to really be in
2597 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2598 */
2599 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2600 {
2601 int sock[2] = {-1, -1};
2602 char *tmpdata = NULL;
2603 int ret;
2604 pid_t qpid, cpid = -1;
2605 bool answer = false;
2606 char v = '0';
2607 struct ucred cred;
2608 size_t sz = 0, asz = 0;
2609
2610 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2611 return false;
2612
2613 /*
2614 * Now we read the pids from returned data one by one, pass
2615 * them into a child in the target namespace, read back the
2616 * translated pids, and put them into our to-return data
2617 */
2618
2619 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2620 perror("socketpair");
2621 free(tmpdata);
2622 return false;
2623 }
2624
2625 cpid = fork();
2626 if (cpid == -1)
2627 goto out;
2628
2629 if (!cpid) // child - exits when done
2630 pid_to_ns_wrapper(sock[1], tpid);
2631
2632 char *ptr = tmpdata;
2633 cred.uid = 0;
2634 cred.gid = 0;
2635 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2636 cred.pid = qpid;
2637 ret = send_creds(sock[0], &cred, v, true);
2638
2639 if (ret == SEND_CREDS_NOTSK)
2640 goto next;
2641 if (ret == SEND_CREDS_FAIL)
2642 goto out;
2643
2644 // read converted results
2645 if (!wait_for_sock(sock[0], 2)) {
2646 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2647 goto out;
2648 }
2649 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2650 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2651 goto out;
2652 }
2653 must_strcat_pid(d, &sz, &asz, qpid);
2654 next:
2655 ptr = strchr(ptr, '\n');
2656 if (!ptr)
2657 break;
2658 ptr++;
2659 }
2660
2661 cred.pid = getpid();
2662 v = '1';
2663 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2664 // failed to ask child to exit
2665 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2666 goto out;
2667 }
2668
2669 answer = true;
2670
2671 out:
2672 free(tmpdata);
2673 if (cpid != -1)
2674 wait_for_pid(cpid);
2675 if (sock[0] != -1) {
2676 close(sock[0]);
2677 close(sock[1]);
2678 }
2679 return answer;
2680 }
2681
2682 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2683 struct fuse_file_info *fi)
2684 {
2685 struct fuse_context *fc = fuse_get_context();
2686 struct file_info *f = (struct file_info *)fi->fh;
2687 struct cgfs_files *k = NULL;
2688 char *data = NULL;
2689 int ret, s;
2690 bool r;
2691
2692 if (f->type != LXC_TYPE_CGFILE) {
2693 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2694 return -EIO;
2695 }
2696
2697 if (offset)
2698 return 0;
2699
2700 if (!fc)
2701 return -EIO;
2702
2703 if (!f->controller)
2704 return -EINVAL;
2705
2706 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2707 return -EINVAL;
2708 }
2709 free_key(k);
2710
2711
2712 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2713 ret = -EACCES;
2714 goto out;
2715 }
2716
2717 if (strcmp(f->file, "tasks") == 0 ||
2718 strcmp(f->file, "/tasks") == 0 ||
2719 strcmp(f->file, "/cgroup.procs") == 0 ||
2720 strcmp(f->file, "cgroup.procs") == 0)
2721 // special case - we have to translate the pids
2722 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2723 else
2724 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2725
2726 if (!r) {
2727 ret = -EINVAL;
2728 goto out;
2729 }
2730
2731 if (!data) {
2732 ret = 0;
2733 goto out;
2734 }
2735 s = strlen(data);
2736 if (s > size)
2737 s = size;
2738 memcpy(buf, data, s);
2739 if (s > 0 && s < size && data[s-1] != '\n')
2740 buf[s++] = '\n';
2741
2742 ret = s;
2743
2744 out:
2745 free(data);
2746 return ret;
2747 }
2748
2749 static int pid_from_ns(int sock, pid_t tpid)
2750 {
2751 pid_t vpid;
2752 struct ucred cred;
2753 char v;
2754 int ret;
2755
2756 cred.uid = 0;
2757 cred.gid = 0;
2758 while (1) {
2759 if (!wait_for_sock(sock, 2)) {
2760 lxcfs_error("%s\n", "Timeout reading from parent.");
2761 return 1;
2762 }
2763 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2764 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2765 return 1;
2766 }
2767 if (vpid == -1) // done
2768 break;
2769 v = '0';
2770 cred.pid = vpid;
2771 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2772 v = '1';
2773 cred.pid = getpid();
2774 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2775 return 1;
2776 }
2777 }
2778 return 0;
2779 }
2780
2781 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2782 {
2783 int newnsfd = -1, ret, cpipe[2];
2784 char fnam[100];
2785 pid_t cpid;
2786 char v;
2787
2788 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2789 if (ret < 0 || ret >= sizeof(fnam))
2790 _exit(1);
2791 newnsfd = open(fnam, O_RDONLY);
2792 if (newnsfd < 0)
2793 _exit(1);
2794 if (setns(newnsfd, 0) < 0)
2795 _exit(1);
2796 close(newnsfd);
2797
2798 if (pipe(cpipe) < 0)
2799 _exit(1);
2800
2801 struct pid_ns_clone_args args = {
2802 .cpipe = cpipe,
2803 .sock = sock,
2804 .tpid = tpid,
2805 .wrapped = &pid_from_ns
2806 };
2807 size_t stack_size = sysconf(_SC_PAGESIZE);
2808 void *stack = alloca(stack_size);
2809
2810 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2811 if (cpid < 0)
2812 _exit(1);
2813
2814 // give the child 1 second to be done forking and
2815 // write its ack
2816 if (!wait_for_sock(cpipe[0], 1))
2817 _exit(1);
2818 ret = read(cpipe[0], &v, 1);
2819 if (ret != sizeof(char) || v != '1')
2820 _exit(1);
2821
2822 if (!wait_for_pid(cpid))
2823 _exit(1);
2824 _exit(0);
2825 }
2826
/*
 * hostuid_to_ns - translate host @uid through /proc/@pid/uid_map.
 *
 * On success the mapped uid is stored in @answer and true is returned.
 * false is returned when the map file cannot be opened or when
 * convert_id_to_ns() yields -1; in the latter case @answer holds
 * that -1 value.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *map;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	map = fopen(mapfile, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2848
/*
 * get_pid_creds - look up the uid and gid of @pid in /proc/<pid>/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is reported (the real
 * ids; XXX should we use euid here?). @uid and @gid are preset to -1 and
 * keep that value when the file cannot be opened or the matching line is
 * not found. Parsing stops at the first malformed "Uid:"/"Gid:" line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char buf[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	/* buf first holds the path, then is reused for the lines read. */
	sprintf(buf, "/proc/%d/status", pid);
	status = fopen(buf, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", buf, strerror(errno));
		return;
	}

	while (fgets(buf, sizeof(buf), status)) {
		if (strncmp(buf, "Uid:", 4) == 0) {
			if (sscanf(buf + 4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
		} else if (strncmp(buf, "Gid:", 4) == 0) {
			if (sscanf(buf + 4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2887
/*
 * may_move_pid - may requestor @r (running as host uid @r_uid) move
 * victim @v to a new cgroup?
 *
 * This is allowed if
 *  . they are the same task,
 *  . @r is root on the host,
 *  . they are owned by the same uid, or
 *  . @r_uid maps to root in @r's user namespace and @v's uid is mapped
 *    into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* The requestor must be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be visible in that namespace. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2913
2914 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2915 const char *file, const char *buf)
2916 {
2917 int sock[2] = {-1, -1};
2918 pid_t qpid, cpid = -1;
2919 FILE *pids_file = NULL;
2920 bool answer = false, fail = false;
2921
2922 pids_file = open_pids_file(contrl, cg);
2923 if (!pids_file)
2924 return false;
2925
2926 /*
2927 * write the pids to a socket, have helper in writer's pidns
2928 * call movepid for us
2929 */
2930 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2931 perror("socketpair");
2932 goto out;
2933 }
2934
2935 cpid = fork();
2936 if (cpid == -1)
2937 goto out;
2938
2939 if (!cpid) { // child
2940 fclose(pids_file);
2941 pid_from_ns_wrapper(sock[1], tpid);
2942 }
2943
2944 const char *ptr = buf;
2945 while (sscanf(ptr, "%d", &qpid) == 1) {
2946 struct ucred cred;
2947 char v;
2948
2949 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2950 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2951 goto out;
2952 }
2953
2954 if (recv_creds(sock[0], &cred, &v)) {
2955 if (v == '0') {
2956 if (!may_move_pid(tpid, tuid, cred.pid)) {
2957 fail = true;
2958 break;
2959 }
2960 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2961 fail = true;
2962 }
2963 }
2964
2965 ptr = strchr(ptr, '\n');
2966 if (!ptr)
2967 break;
2968 ptr++;
2969 }
2970
2971 /* All good, write the value */
2972 qpid = -1;
2973 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2974 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2975
2976 if (!fail)
2977 answer = true;
2978
2979 out:
2980 if (cpid != -1)
2981 wait_for_pid(cpid);
2982 if (sock[0] != -1) {
2983 close(sock[0]);
2984 close(sock[1]);
2985 }
2986 if (pids_file) {
2987 if (fclose(pids_file) != 0)
2988 answer = false;
2989 }
2990 return answer;
2991 }
2992
2993 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2994 struct fuse_file_info *fi)
2995 {
2996 struct fuse_context *fc = fuse_get_context();
2997 char *localbuf = NULL;
2998 struct cgfs_files *k = NULL;
2999 struct file_info *f = (struct file_info *)fi->fh;
3000 bool r;
3001
3002 if (f->type != LXC_TYPE_CGFILE) {
3003 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3004 return -EIO;
3005 }
3006
3007 if (offset)
3008 return 0;
3009
3010 if (!fc)
3011 return -EIO;
3012
3013 localbuf = alloca(size+1);
3014 localbuf[size] = '\0';
3015 memcpy(localbuf, buf, size);
3016
3017 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3018 size = -EINVAL;
3019 goto out;
3020 }
3021
3022 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3023 size = -EACCES;
3024 goto out;
3025 }
3026
3027 if (strcmp(f->file, "tasks") == 0 ||
3028 strcmp(f->file, "/tasks") == 0 ||
3029 strcmp(f->file, "/cgroup.procs") == 0 ||
3030 strcmp(f->file, "cgroup.procs") == 0)
3031 // special case - we have to translate the pids
3032 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3033 else
3034 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3035
3036 if (!r)
3037 size = -EINVAL;
3038
3039 out:
3040 free_key(k);
3041 return size;
3042 }
3043
3044 int cg_chown(const char *path, uid_t uid, gid_t gid)
3045 {
3046 struct fuse_context *fc = fuse_get_context();
3047 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3048 struct cgfs_files *k = NULL;
3049 const char *cgroup;
3050 int ret;
3051
3052 if (!fc)
3053 return -EIO;
3054
3055 if (strcmp(path, "/cgroup") == 0)
3056 return -EPERM;
3057
3058 controller = pick_controller_from_path(fc, path);
3059 if (!controller)
3060 return errno == ENOENT ? -EPERM : -errno;
3061
3062 cgroup = find_cgroup_in_path(path);
3063 if (!cgroup)
3064 /* this is just /cgroup/controller */
3065 return -EPERM;
3066
3067 get_cgdir_and_path(cgroup, &cgdir, &last);
3068
3069 if (!last) {
3070 path1 = "/";
3071 path2 = cgdir;
3072 } else {
3073 path1 = cgdir;
3074 path2 = last;
3075 }
3076
3077 if (is_child_cgroup(controller, path1, path2)) {
3078 // get uid, gid, from '/tasks' file and make up a mode
3079 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3080 k = cgfs_get_key(controller, cgroup, "tasks");
3081
3082 } else
3083 k = cgfs_get_key(controller, path1, path2);
3084
3085 if (!k) {
3086 ret = -EINVAL;
3087 goto out;
3088 }
3089
3090 /*
3091 * This being a fuse request, the uid and gid must be valid
3092 * in the caller's namespace. So we can just check to make
3093 * sure that the caller is root in his uid, and privileged
3094 * over the file's current owner.
3095 */
3096 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3097 ret = -EACCES;
3098 goto out;
3099 }
3100
3101 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3102
3103 out:
3104 free_key(k);
3105 free(cgdir);
3106
3107 return ret;
3108 }
3109
3110 int cg_chmod(const char *path, mode_t mode)
3111 {
3112 struct fuse_context *fc = fuse_get_context();
3113 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3114 struct cgfs_files *k = NULL;
3115 const char *cgroup;
3116 int ret;
3117
3118 if (!fc)
3119 return -EIO;
3120
3121 if (strcmp(path, "/cgroup") == 0)
3122 return -EPERM;
3123
3124 controller = pick_controller_from_path(fc, path);
3125 if (!controller)
3126 return errno == ENOENT ? -EPERM : -errno;
3127
3128 cgroup = find_cgroup_in_path(path);
3129 if (!cgroup)
3130 /* this is just /cgroup/controller */
3131 return -EPERM;
3132
3133 get_cgdir_and_path(cgroup, &cgdir, &last);
3134
3135 if (!last) {
3136 path1 = "/";
3137 path2 = cgdir;
3138 } else {
3139 path1 = cgdir;
3140 path2 = last;
3141 }
3142
3143 if (is_child_cgroup(controller, path1, path2)) {
3144 // get uid, gid, from '/tasks' file and make up a mode
3145 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3146 k = cgfs_get_key(controller, cgroup, "tasks");
3147
3148 } else
3149 k = cgfs_get_key(controller, path1, path2);
3150
3151 if (!k) {
3152 ret = -EINVAL;
3153 goto out;
3154 }
3155
3156 /*
3157 * This being a fuse request, the uid and gid must be valid
3158 * in the caller's namespace. So we can just check to make
3159 * sure that the caller is root in his uid, and privileged
3160 * over the file's current owner.
3161 */
3162 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3163 ret = -EPERM;
3164 goto out;
3165 }
3166
3167 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3168 ret = -EINVAL;
3169 goto out;
3170 }
3171
3172 ret = 0;
3173 out:
3174 free_key(k);
3175 free(cgdir);
3176 return ret;
3177 }
3178
3179 int cg_mkdir(const char *path, mode_t mode)
3180 {
3181 struct fuse_context *fc = fuse_get_context();
3182 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3183 const char *cgroup;
3184 int ret;
3185
3186 if (!fc)
3187 return -EIO;
3188
3189 controller = pick_controller_from_path(fc, path);
3190 if (!controller)
3191 return errno == ENOENT ? -EPERM : -errno;
3192
3193 cgroup = find_cgroup_in_path(path);
3194 if (!cgroup)
3195 return -errno;
3196
3197 get_cgdir_and_path(cgroup, &cgdir, &last);
3198 if (!last)
3199 path1 = "/";
3200 else
3201 path1 = cgdir;
3202
3203 pid_t initpid = lookup_initpid_in_store(fc->pid);
3204 if (initpid <= 0)
3205 initpid = fc->pid;
3206 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3207 if (!next)
3208 ret = -EINVAL;
3209 else if (last && strcmp(next, last) == 0)
3210 ret = -EEXIST;
3211 else
3212 ret = -EPERM;
3213 goto out;
3214 }
3215
3216 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3217 ret = -EACCES;
3218 goto out;
3219 }
3220 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3221 ret = -EACCES;
3222 goto out;
3223 }
3224
3225 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3226
3227 out:
3228 free(cgdir);
3229 free(next);
3230 return ret;
3231 }
3232
3233 int cg_rmdir(const char *path)
3234 {
3235 struct fuse_context *fc = fuse_get_context();
3236 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3237 const char *cgroup;
3238 int ret;
3239
3240 if (!fc)
3241 return -EIO;
3242
3243 controller = pick_controller_from_path(fc, path);
3244 if (!controller) /* Someone's trying to delete "/cgroup". */
3245 return -EPERM;
3246
3247 cgroup = find_cgroup_in_path(path);
3248 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3249 return -EPERM;
3250
3251 get_cgdir_and_path(cgroup, &cgdir, &last);
3252 if (!last) {
3253 /* Someone's trying to delete a cgroup on the same level as the
3254 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3255 * rmdir "/cgroup/blkio/init.slice".
3256 */
3257 ret = -EPERM;
3258 goto out;
3259 }
3260
3261 pid_t initpid = lookup_initpid_in_store(fc->pid);
3262 if (initpid <= 0)
3263 initpid = fc->pid;
3264 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3265 if (!last || (next && (strcmp(next, last) == 0)))
3266 ret = -EBUSY;
3267 else
3268 ret = -ENOENT;
3269 goto out;
3270 }
3271
3272 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3273 ret = -EACCES;
3274 goto out;
3275 }
3276 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3277 ret = -EACCES;
3278 goto out;
3279 }
3280
3281 if (!cgfs_remove(controller, cgroup)) {
3282 ret = -EINVAL;
3283 goto out;
3284 }
3285
3286 ret = 0;
3287
3288 out:
3289 free(cgdir);
3290 free(next);
3291 return ret;
3292 }
3293
/*
 * startswith - true when @line begins with the string @pref.
 */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3300
/*
 * parse_memstat - pick the hierarchical ("total_*") counters out of the
 * text of a memory.stat file.
 *
 * @memstat is scanned line by line. For each line starting with one of the
 * known keys the number following the key is parsed into the matching
 * output argument and divided by 1024. Outputs whose key never appears are
 * left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache", cached },
		{ "total_active_anon", active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file", active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable", unevictable },
		{ "total_shmem", shmem },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *line = memstat;

	while (*line) {
		char *nl;
		size_t i;

		/* The first key that prefixes the line wins. */
		for (i = 0; i < nfields; i++) {
			size_t klen = strlen(fields[i].key);

			if (strncmp(line, fields[i].key, klen) != 0)
				continue;
			sscanf(line + klen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		nl = strchr(line, '\n');
		if (!nl)
			return;
		line = nl + 1;
	}
}
3337
/*
 * get_blkio_io_value - find the counter for device @major:@minor and
 * operation @iotype in blkio statistics text @str.
 *
 * The text is scanned line by line for one starting with
 * "<major>:<minor> <iotype>"; the number following that prefix is stored
 * in @v. @v is set to 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *line, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);
	*v = 0;

	for (line = str; *line; line = nl + 1) {
		if (strncmp(line, key, keylen) == 0) {
			sscanf(line + keylen, "%lu", v);
			return;
		}
		nl = strchr(line, '\n');
		if (!nl)
			return;
	}
}
3360
3361 static int read_file(const char *path, char *buf, size_t size,
3362 struct file_info *d)
3363 {
3364 size_t linelen = 0, total_len = 0, rv = 0;
3365 char *line = NULL;
3366 char *cache = d->buf;
3367 size_t cache_size = d->buflen;
3368 FILE *f = fopen(path, "r");
3369 if (!f)
3370 return 0;
3371
3372 while (getline(&line, &linelen, f) != -1) {
3373 ssize_t l = snprintf(cache, cache_size, "%s", line);
3374 if (l < 0) {
3375 perror("Error writing to cache");
3376 rv = 0;
3377 goto err;
3378 }
3379 if (l >= cache_size) {
3380 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3381 rv = 0;
3382 goto err;
3383 }
3384 cache += l;
3385 cache_size -= l;
3386 total_len += l;
3387 }
3388
3389 d->size = total_len;
3390 if (total_len > size)
3391 total_len = size;
3392
3393 /* read from off 0 */
3394 memcpy(buf, d->buf, total_len);
3395 rv = total_len;
3396 err:
3397 fclose(f);
3398 free(line);
3399 return rv;
3400 }
3401
3402 /*
3403 * FUSE ops for /proc
3404 */
3405
/*
 * get_memlimit - read @file of the "memory" controller for @cgroup and
 * parse it as a base-10 unsigned number.
 *
 * Returns the parsed value, or (unsigned long)-1 when the file cannot be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);

	return limit;
}
3418
/*
 * get_min_memlimit - smallest value of memory-controller @file found in
 * @cgroup or any of its ancestors.
 *
 * Starts from the value read for @cgroup itself and then walks up the
 * path with dirname(), keeping the minimum of all values that could be
 * read. Ancestors whose value cannot be read (get_memlimit() returning
 * (unsigned long)-1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit, retlimit;

	retlimit = get_memlimit(copy, file);

	/*
	 * dirname() turns a path without a leading '/' into "." and keeps
	 * returning "." from then on, so "." must terminate the walk as
	 * well; otherwise such a path would loop forever.
	 */
	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3435
3436 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3437 struct fuse_file_info *fi)
3438 {
3439 struct fuse_context *fc = fuse_get_context();
3440 struct file_info *d = (struct file_info *)fi->fh;
3441 char *cg;
3442 char *memusage_str = NULL, *memstat_str = NULL,
3443 *memswlimit_str = NULL, *memswusage_str = NULL;
3444 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3445 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3446 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3447 hostswtotal = 0;
3448 char *line = NULL;
3449 size_t linelen = 0, total_len = 0, rv = 0;
3450 char *cache = d->buf;
3451 size_t cache_size = d->buflen;
3452 FILE *f = NULL;
3453
3454 if (offset){
3455 if (offset > d->size)
3456 return -EINVAL;
3457 if (!d->cached)
3458 return 0;
3459 int left = d->size - offset;
3460 total_len = left > size ? size: left;
3461 memcpy(buf, cache + offset, total_len);
3462 return total_len;
3463 }
3464
3465 pid_t initpid = lookup_initpid_in_store(fc->pid);
3466 if (initpid <= 0)
3467 initpid = fc->pid;
3468 cg = get_pid_cgroup(initpid, "memory");
3469 if (!cg)
3470 return read_file("/proc/meminfo", buf, size, d);
3471 prune_init_slice(cg);
3472
3473 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3474 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3475 goto err;
3476 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3477 goto err;
3478
3479 // Following values are allowed to fail, because swapaccount might be turned
3480 // off for current kernel
3481 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3482 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3483 {
3484 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3485 memswusage = strtoul(memswusage_str, NULL, 10);
3486
3487 memswlimit = memswlimit / 1024;
3488 memswusage = memswusage / 1024;
3489 }
3490
3491 memusage = strtoul(memusage_str, NULL, 10);
3492 memlimit /= 1024;
3493 memusage /= 1024;
3494
3495 parse_memstat(memstat_str, &cached, &active_anon,
3496 &inactive_anon, &active_file, &inactive_file,
3497 &unevictable, &shmem);
3498
3499 f = fopen("/proc/meminfo", "r");
3500 if (!f)
3501 goto err;
3502
3503 while (getline(&line, &linelen, f) != -1) {
3504 ssize_t l;
3505 char *printme, lbuf[100];
3506
3507 memset(lbuf, 0, 100);
3508 if (startswith(line, "MemTotal:")) {
3509 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3510 if (hosttotal < memlimit)
3511 memlimit = hosttotal;
3512 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3513 printme = lbuf;
3514 } else if (startswith(line, "MemFree:")) {
3515 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3516 printme = lbuf;
3517 } else if (startswith(line, "MemAvailable:")) {
3518 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3519 printme = lbuf;
3520 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3521 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3522 if (hostswtotal < memswlimit)
3523 memswlimit = hostswtotal;
3524 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3525 printme = lbuf;
3526 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3527 unsigned long swaptotal = memswlimit,
3528 swapusage = memswusage - memusage,
3529 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3530 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3531 printme = lbuf;
3532 } else if (startswith(line, "Slab:")) {
3533 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3534 printme = lbuf;
3535 } else if (startswith(line, "Buffers:")) {
3536 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3537 printme = lbuf;
3538 } else if (startswith(line, "Cached:")) {
3539 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3540 printme = lbuf;
3541 } else if (startswith(line, "SwapCached:")) {
3542 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3543 printme = lbuf;
3544 } else if (startswith(line, "Active:")) {
3545 snprintf(lbuf, 100, "Active: %8lu kB\n",
3546 active_anon + active_file);
3547 printme = lbuf;
3548 } else if (startswith(line, "Inactive:")) {
3549 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3550 inactive_anon + inactive_file);
3551 printme = lbuf;
3552 } else if (startswith(line, "Active(anon)")) {
3553 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3554 printme = lbuf;
3555 } else if (startswith(line, "Inactive(anon)")) {
3556 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3557 printme = lbuf;
3558 } else if (startswith(line, "Active(file)")) {
3559 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3560 printme = lbuf;
3561 } else if (startswith(line, "Inactive(file)")) {
3562 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3563 printme = lbuf;
3564 } else if (startswith(line, "Unevictable")) {
3565 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3566 printme = lbuf;
3567 } else if (startswith(line, "SReclaimable")) {
3568 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3569 printme = lbuf;
3570 } else if (startswith(line, "SUnreclaim")) {
3571 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3572 printme = lbuf;
3573 } else if (startswith(line, "Shmem:")) {
3574 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3575 printme = lbuf;
3576 } else if (startswith(line, "ShmemHugePages")) {
3577 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3578 printme = lbuf;
3579 } else if (startswith(line, "ShmemPmdMapped")) {
3580 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3581 printme = lbuf;
3582 } else
3583 printme = line;
3584
3585 l = snprintf(cache, cache_size, "%s", printme);
3586 if (l < 0) {
3587 perror("Error writing to cache");
3588 rv = 0;
3589 goto err;
3590
3591 }
3592 if (l >= cache_size) {
3593 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3594 rv = 0;
3595 goto err;
3596 }
3597
3598 cache += l;
3599 cache_size -= l;
3600 total_len += l;
3601 }
3602
3603 d->cached = 1;
3604 d->size = total_len;
3605 if (total_len > size ) total_len = size;
3606 memcpy(buf, d->buf, total_len);
3607
3608 rv = total_len;
3609 err:
3610 if (f)
3611 fclose(f);
3612 free(line);
3613 free(cg);
3614 free(memusage_str);
3615 free(memswlimit_str);
3616 free(memswusage_str);
3617 free(memstat_str);
3618 return rv;
3619 }
3620
/*
 * get_cpuset - fetch the contents of cpuset.cpus for cgroup @cg.
 * The string returned is newly allocated and must be freed by the caller;
 * NULL is returned when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3633
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * cpuline_in_cpuset - true when @line is a "processor : N" line and
 * cpu_in_cpuset() accepts N for @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3644
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
 * depending on `param`. Parameter value is returned through `value`.
 *
 * Returns true when the file was read and parsed, false otherwise
 * (including when `param` does not fit the file name buffer).
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int len;

	/* Bounded formatting: an unexpected @param must not overflow file[]. */
	len = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (len < 0 || (size_t)len >= sizeof(file))
		return false;

	if (!cgfs_get_value("cpu", cg, file, &str) || !str)
		goto err;

	/* SCNd64 matches int64_t on every platform; "%ld" only where long is 64-bit. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str);
	return rv;
}
3670
/*
 * max_cpu_count - number of CPUs the CFS quota of cgroup @cg allows.
 *
 * This is quota / period rounded up, capped at get_nprocs(). Zero is
 * returned when either parameter cannot be read or is not positive,
 * i.e. when no quota is set.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, online;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = quota / period;

	/* A partial CPU's worth of quota still counts as one more CPU. */
	if ((quota % period) > 0)
		count++;

	online = get_nprocs();

	return count > online ? online : count;
}
3704
/*
 * Decide whether CPU views can be used: both the "cpu" and the "cpuacct"
 * controllers must be found by find_mounted_controller().  The `cg` argument
 * is not consulted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3723
/*
 * Tell whether `line` is a "processor : N" line of /proc/cpuinfo, i.e.
 * whether a CPU number can be parsed after the "processor" key.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3735
3736 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3737 struct fuse_file_info *fi)
3738 {
3739 struct fuse_context *fc = fuse_get_context();
3740 struct file_info *d = (struct file_info *)fi->fh;
3741 char *cg;
3742 char *cpuset = NULL;
3743 char *line = NULL;
3744 size_t linelen = 0, total_len = 0, rv = 0;
3745 bool am_printing = false, firstline = true, is_s390x = false;
3746 int curcpu = -1, cpu, max_cpus = 0;
3747 bool use_view;
3748 char *cache = d->buf;
3749 size_t cache_size = d->buflen;
3750 FILE *f = NULL;
3751
3752 if (offset){
3753 if (offset > d->size)
3754 return -EINVAL;
3755 if (!d->cached)
3756 return 0;
3757 int left = d->size - offset;
3758 total_len = left > size ? size: left;
3759 memcpy(buf, cache + offset, total_len);
3760 return total_len;
3761 }
3762
3763 pid_t initpid = lookup_initpid_in_store(fc->pid);
3764 if (initpid <= 0)
3765 initpid = fc->pid;
3766 cg = get_pid_cgroup(initpid, "cpuset");
3767 if (!cg)
3768 return read_file("proc/cpuinfo", buf, size, d);
3769 prune_init_slice(cg);
3770
3771 cpuset = get_cpuset(cg);
3772 if (!cpuset)
3773 goto err;
3774
3775 use_view = use_cpuview(cg);
3776
3777 if (use_view)
3778 max_cpus = max_cpu_count(cg);
3779
3780 f = fopen("/proc/cpuinfo", "r");
3781 if (!f)
3782 goto err;
3783
3784 while (getline(&line, &linelen, f) != -1) {
3785 ssize_t l;
3786 if (firstline) {
3787 firstline = false;
3788 if (strstr(line, "IBM/S390") != NULL) {
3789 is_s390x = true;
3790 am_printing = true;
3791 continue;
3792 }
3793 }
3794 if (strncmp(line, "# processors:", 12) == 0)
3795 continue;
3796 if (is_processor_line(line)) {
3797 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3798 break;
3799 am_printing = cpuline_in_cpuset(line, cpuset);
3800 if (am_printing) {
3801 curcpu ++;
3802 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3803 if (l < 0) {
3804 perror("Error writing to cache");
3805 rv = 0;
3806 goto err;
3807 }
3808 if (l >= cache_size) {
3809 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3810 rv = 0;
3811 goto err;
3812 }
3813 cache += l;
3814 cache_size -= l;
3815 total_len += l;
3816 }
3817 continue;
3818 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3819 char *p;
3820 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3821 break;
3822 if (!cpu_in_cpuset(cpu, cpuset))
3823 continue;
3824 curcpu ++;
3825 p = strchr(line, ':');
3826 if (!p || !*p)
3827 goto err;
3828 p++;
3829 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3830 if (l < 0) {
3831 perror("Error writing to cache");
3832 rv = 0;
3833 goto err;
3834 }
3835 if (l >= cache_size) {
3836 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3837 rv = 0;
3838 goto err;
3839 }
3840 cache += l;
3841 cache_size -= l;
3842 total_len += l;
3843 continue;
3844
3845 }
3846 if (am_printing) {
3847 l = snprintf(cache, cache_size, "%s", line);
3848 if (l < 0) {
3849 perror("Error writing to cache");
3850 rv = 0;
3851 goto err;
3852 }
3853 if (l >= cache_size) {
3854 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3855 rv = 0;
3856 goto err;
3857 }
3858 cache += l;
3859 cache_size -= l;
3860 total_len += l;
3861 }
3862 }
3863
3864 if (is_s390x) {
3865 char *origcache = d->buf;
3866 ssize_t l;
3867 do {
3868 d->buf = malloc(d->buflen);
3869 } while (!d->buf);
3870 cache = d->buf;
3871 cache_size = d->buflen;
3872 total_len = 0;
3873 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3874 if (l < 0 || l >= cache_size) {
3875 free(origcache);
3876 goto err;
3877 }
3878 cache_size -= l;
3879 cache += l;
3880 total_len += l;
3881 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3882 if (l < 0 || l >= cache_size) {
3883 free(origcache);
3884 goto err;
3885 }
3886 cache_size -= l;
3887 cache += l;
3888 total_len += l;
3889 l = snprintf(cache, cache_size, "%s", origcache);
3890 free(origcache);
3891 if (l < 0 || l >= cache_size)
3892 goto err;
3893 total_len += l;
3894 }
3895
3896 d->cached = 1;
3897 d->size = total_len;
3898 if (total_len > size ) total_len = size;
3899
3900 /* read from off 0 */
3901 memcpy(buf, d->buf, total_len);
3902 rv = total_len;
3903 err:
3904 if (f)
3905 fclose(f);
3906 free(line);
3907 free(cpuset);
3908 free(cg);
3909 return rv;
3910 }
3911
3912 static uint64_t get_reaper_start_time(pid_t pid)
3913 {
3914 int ret;
3915 FILE *f;
3916 uint64_t starttime;
3917 /* strlen("/proc/") = 6
3918 * +
3919 * LXCFS_NUMSTRLEN64
3920 * +
3921 * strlen("/stat") = 5
3922 * +
3923 * \0 = 1
3924 * */
3925 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3926 char path[__PROC_PID_STAT_LEN];
3927 pid_t qpid;
3928
3929 qpid = lookup_initpid_in_store(pid);
3930 if (qpid <= 0) {
3931 /* Caller can check for EINVAL on 0. */
3932 errno = EINVAL;
3933 return 0;
3934 }
3935
3936 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3937 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3938 /* Caller can check for EINVAL on 0. */
3939 errno = EINVAL;
3940 return 0;
3941 }
3942
3943 f = fopen(path, "r");
3944 if (!f) {
3945 /* Caller can check for EINVAL on 0. */
3946 errno = EINVAL;
3947 return 0;
3948 }
3949
3950 /* Note that the *scanf() argument supression requires that length
3951 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3952 * at us. It's like telling someone you're not married and then asking
3953 * if you can bring your wife to the party.
3954 */
3955 ret = fscanf(f, "%*d " /* (1) pid %d */
3956 "%*s " /* (2) comm %s */
3957 "%*c " /* (3) state %c */
3958 "%*d " /* (4) ppid %d */
3959 "%*d " /* (5) pgrp %d */
3960 "%*d " /* (6) session %d */
3961 "%*d " /* (7) tty_nr %d */
3962 "%*d " /* (8) tpgid %d */
3963 "%*u " /* (9) flags %u */
3964 "%*u " /* (10) minflt %lu */
3965 "%*u " /* (11) cminflt %lu */
3966 "%*u " /* (12) majflt %lu */
3967 "%*u " /* (13) cmajflt %lu */
3968 "%*u " /* (14) utime %lu */
3969 "%*u " /* (15) stime %lu */
3970 "%*d " /* (16) cutime %ld */
3971 "%*d " /* (17) cstime %ld */
3972 "%*d " /* (18) priority %ld */
3973 "%*d " /* (19) nice %ld */
3974 "%*d " /* (20) num_threads %ld */
3975 "%*d " /* (21) itrealvalue %ld */
3976 "%" PRIu64, /* (22) starttime %llu */
3977 &starttime);
3978 if (ret != 1) {
3979 fclose(f);
3980 /* Caller can check for EINVAL on 0. */
3981 errno = EINVAL;
3982 return 0;
3983 }
3984
3985 fclose(f);
3986
3987 errno = 0;
3988 return starttime;
3989 }
3990
/*
 * Return the start time of the reaper of `pid` converted from clock ticks to
 * seconds, i.e. get_reaper_start_time() divided by sysconf(_SC_CLK_TCK).
 *
 * Returns 0 when the start time cannot be retrieved or when the tick rate is
 * not a positive number.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Reject every non-positive result, not only "-1 with EINVAL": the
	 * value is used as a divisor below, so 0 would trap and a negative
	 * number would be converted to a huge unsigned divisor. */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
4012
4013 static uint64_t get_reaper_age(pid_t pid)
4014 {
4015 uint64_t procstart, uptime, procage;
4016
4017 /* We need to substract the time the process has started since system
4018 * boot minus the time when the system has started to get the actual
4019 * reaper age.
4020 */
4021 procstart = get_reaper_start_time_in_sec(pid);
4022 procage = procstart;
4023 if (procstart > 0) {
4024 int ret;
4025 struct timespec spec;
4026
4027 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4028 if (ret < 0)
4029 return 0;
4030 /* We could make this more precise here by using the tv_nsec
4031 * field in the timespec struct and convert it to milliseconds
4032 * and then create a double for the seconds and milliseconds but
4033 * that seems more work than it is worth.
4034 */
4035 uptime = spec.tv_sec;
4036 procage = uptime - procstart;
4037 }
4038
4039 return procage;
4040 }
4041
4042 /*
4043 * Returns 0 on success.
4044 * It is the caller's responsibility to free `return_usage`, unless this
4045 * function returns an error.
4046 */
4047 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4048 {
4049 int cpucount = get_nprocs_conf();
4050 struct cpuacct_usage *cpu_usage;
4051 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4052 int cg_cpu;
4053 uint64_t cg_user, cg_system;
4054 int64_t ticks_per_sec;
4055 char *usage_str = NULL;
4056
4057 ticks_per_sec = sysconf(_SC_CLK_TCK);
4058
4059 if (ticks_per_sec < 0 && errno == EINVAL) {
4060 lxcfs_debug(
4061 "%s\n",
4062 "read_cpuacct_usage_all failed to determine number of clock ticks "
4063 "in a second");
4064 return -1;
4065 }
4066
4067 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4068 if (!cpu_usage)
4069 return -ENOMEM;
4070
4071 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4072 rv = -1;
4073 goto err;
4074 }
4075
4076 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4077 lxcfs_error("read_cpuacct_usage_all reading first line from "
4078 "%s/cpuacct.usage_all failed.\n", cg);
4079 rv = -1;
4080 goto err;
4081 }
4082
4083 read_pos += read_cnt;
4084
4085 for (i = 0, j = 0; i < cpucount; i++) {
4086 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4087 &cg_system, &read_cnt);
4088
4089 if (ret == EOF)
4090 break;
4091
4092 if (ret != 3) {
4093 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4094 "failed.\n", cg);
4095 rv = -1;
4096 goto err;
4097 }
4098
4099 read_pos += read_cnt;
4100
4101 /* Convert the time from nanoseconds to USER_HZ */
4102 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4103 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4104 j++;
4105 }
4106
4107 rv = 0;
4108 *return_usage = cpu_usage;
4109 *size = cpucount;
4110
4111 err:
4112 if (usage_str)
4113 free(usage_str);
4114
4115 if (rv != 0) {
4116 free(cpu_usage);
4117 *return_usage = NULL;
4118 }
4119
4120 return rv;
4121 }
4122
4123 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4124 {
4125 int i;
4126 unsigned long sum = 0;
4127
4128 for (i = 0; i < cpu_count; i++) {
4129 if (!newer[i].online)
4130 continue;
4131
4132 /* When cpuset is changed on the fly, the CPUs might get reordered.
4133 * We could either reset all counters, or check that the substractions
4134 * below will return expected results.
4135 */
4136 if (newer[i].user > older[i].user)
4137 diff[i].user = newer[i].user - older[i].user;
4138 else
4139 diff[i].user = 0;
4140
4141 if (newer[i].system > older[i].system)
4142 diff[i].system = newer[i].system - older[i].system;
4143 else
4144 diff[i].system = 0;
4145
4146 if (newer[i].idle > older[i].idle)
4147 diff[i].idle = newer[i].idle - older[i].idle;
4148 else
4149 diff[i].idle = 0;
4150
4151 sum += diff[i].user;
4152 sum += diff[i].system;
4153 sum += diff[i].idle;
4154 }
4155
4156 return sum;
4157 }
4158
4159 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4160 {
4161 unsigned long free_space, to_add;
4162
4163 free_space = threshold - usage->user - usage->system;
4164
4165 if (free_space > usage->idle)
4166 free_space = usage->idle;
4167
4168 to_add = free_space > *surplus ? *surplus : free_space;
4169
4170 *counter += to_add;
4171 usage->idle -= to_add;
4172 *surplus -= to_add;
4173 }
4174
4175 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4176 {
4177 struct cg_proc_stat *first = NULL, *prev, *tmp;
4178
4179 for (prev = NULL; node; ) {
4180 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4181 tmp = node;
4182 lxcfs_debug("Removing stat node for %s\n", node->cg);
4183
4184 if (prev)
4185 prev->next = node->next;
4186 else
4187 first = node->next;
4188
4189 node = node->next;
4190 free_proc_stat_node(tmp);
4191 } else {
4192 if (!first)
4193 first = node;
4194 prev = node;
4195 node = node->next;
4196 }
4197 }
4198
4199 return first;
4200 }
4201
4202 #define PROC_STAT_PRUNE_INTERVAL 10
4203 static void prune_proc_stat_history(void)
4204 {
4205 int i;
4206 time_t now = time(NULL);
4207
4208 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4209 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4210
4211 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4212 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4213 return;
4214 }
4215
4216 if (proc_stat_history[i]->next) {
4217 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4218 proc_stat_history[i]->lastcheck = now;
4219 }
4220
4221 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4222 }
4223 }
4224
4225 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4226 {
4227 struct cg_proc_stat *node;
4228
4229 pthread_rwlock_rdlock(&head->lock);
4230
4231 if (!head->next) {
4232 pthread_rwlock_unlock(&head->lock);
4233 return NULL;
4234 }
4235
4236 node = head->next;
4237
4238 do {
4239 if (strcmp(cg, node->cg) == 0)
4240 goto out;
4241 } while ((node = node->next));
4242
4243 node = NULL;
4244
4245 out:
4246 pthread_rwlock_unlock(&head->lock);
4247 prune_proc_stat_history();
4248 return node;
4249 }
4250
4251 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4252 {
4253 struct cg_proc_stat *node;
4254 int i;
4255
4256 node = malloc(sizeof(struct cg_proc_stat));
4257 if (!node)
4258 goto err;
4259
4260 node->cg = NULL;
4261 node->usage = NULL;
4262 node->view = NULL;
4263
4264 node->cg = malloc(strlen(cg) + 1);
4265 if (!node->cg)
4266 goto err;
4267
4268 strcpy(node->cg, cg);
4269
4270 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4271 if (!node->usage)
4272 goto err;
4273
4274 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4275
4276 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4277 if (!node->view)
4278 goto err;
4279
4280 node->cpu_count = cpu_count;
4281 node->next = NULL;
4282
4283 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4284 lxcfs_error("%s\n", "Failed to initialize node lock");
4285 goto err;
4286 }
4287
4288 for (i = 0; i < cpu_count; i++) {
4289 node->view[i].user = 0;
4290 node->view[i].system = 0;
4291 node->view[i].idle = 0;
4292 }
4293
4294 return node;
4295
4296 err:
4297 if (node && node->cg)
4298 free(node->cg);
4299 if (node && node->usage)
4300 free(node->usage);
4301 if (node && node->view)
4302 free(node->view);
4303 if (node)
4304 free(node);
4305
4306 return NULL;
4307 }
4308
4309 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4310 {
4311 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4312 struct cg_proc_stat_head *head = proc_stat_history[hash];
4313 struct cg_proc_stat *node, *rv = new_node;
4314
4315 pthread_rwlock_wrlock(&head->lock);
4316
4317 if (!head->next) {
4318 head->next = new_node;
4319 goto out;
4320 }
4321
4322 node = head->next;
4323
4324 for (;;) {
4325 if (strcmp(node->cg, new_node->cg) == 0) {
4326 /* The node is already present, return it */
4327 free_proc_stat_node(new_node);
4328 rv = node;
4329 goto out;
4330 }
4331
4332 if (node->next) {
4333 node = node->next;
4334 continue;
4335 }
4336
4337 node->next = new_node;
4338 goto out;
4339 }
4340
4341 out:
4342 pthread_rwlock_unlock(&head->lock);
4343 return rv;
4344 }
4345
4346 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4347 {
4348 struct cpuacct_usage *new_usage, *new_view;
4349 int i;
4350
4351 /* Allocate new memory */
4352 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4353 if (!new_usage)
4354 return false;
4355
4356 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4357 if (!new_view) {
4358 free(new_usage);
4359 return false;
4360 }
4361
4362 /* Copy existing data & initialize new elements */
4363 for (i = 0; i < cpu_count; i++) {
4364 if (i < node->cpu_count) {
4365 new_usage[i].user = node->usage[i].user;
4366 new_usage[i].system = node->usage[i].system;
4367 new_usage[i].idle = node->usage[i].idle;
4368
4369 new_view[i].user = node->view[i].user;
4370 new_view[i].system = node->view[i].system;
4371 new_view[i].idle = node->view[i].idle;
4372 } else {
4373 new_usage[i].user = 0;
4374 new_usage[i].system = 0;
4375 new_usage[i].idle = 0;
4376
4377 new_view[i].user = 0;
4378 new_view[i].system = 0;
4379 new_view[i].idle = 0;
4380 }
4381 }
4382
4383 free(node->usage);
4384 free(node->view);
4385
4386 node->usage = new_usage;
4387 node->view = new_view;
4388 node->cpu_count = cpu_count;
4389
4390 return true;
4391 }
4392
4393 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4394 {
4395 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4396 struct cg_proc_stat_head *head = proc_stat_history[hash];
4397 struct cg_proc_stat *node;
4398
4399 node = find_proc_stat_node(head, cg);
4400
4401 if (!node) {
4402 node = new_proc_stat_node(usage, cpu_count, cg);
4403 if (!node)
4404 return NULL;
4405
4406 node = add_proc_stat_node(node);
4407 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4408 }
4409
4410 pthread_mutex_lock(&node->lock);
4411
4412 /* If additional CPUs on the host have been enabled, CPU usage counter
4413 * arrays have to be expanded */
4414 if (node->cpu_count < cpu_count) {
4415 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4416 node->cpu_count, cpu_count, cg);
4417
4418 if (!expand_proc_stat_node(node, cpu_count)) {
4419 pthread_mutex_unlock(&node->lock);
4420 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4421 node->cpu_count, cpu_count, cg);
4422 return NULL;
4423 }
4424 }
4425
4426 return node;
4427 }
4428
4429 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4430 {
4431 int i;
4432
4433 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4434 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4435
4436 for (i = 0; i < cpu_count; i++) {
4437 node->view[i].user = 0;
4438 node->view[i].system = 0;
4439 node->view[i].idle = 0;
4440 }
4441
4442 node->cpu_count = cpu_count;
4443 }
4444
4445 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4446 {
4447 char *line = NULL;
4448 size_t linelen = 0, total_len = 0, rv = 0, l;
4449 int curcpu = -1; /* cpu numbering starts at 0 */
4450 int physcpu, i;
4451 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4452 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4453 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4454 unsigned long user_surplus = 0, system_surplus = 0;
4455 unsigned long total_sum, threshold;
4456 struct cg_proc_stat *stat_node;
4457 struct cpuacct_usage *diff = NULL;
4458 int nprocs = get_nprocs_conf();
4459
4460 if (cg_cpu_usage_size < nprocs)
4461 nprocs = cg_cpu_usage_size;
4462
4463 /* Read all CPU stats and stop when we've encountered other lines */
4464 while (getline(&line, &linelen, f) != -1) {
4465 int ret;
4466 char cpu_char[10]; /* That's a lot of cores */
4467 uint64_t all_used, cg_used;
4468
4469 if (strlen(line) == 0)
4470 continue;
4471 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4472 /* not a ^cpuN line containing a number N */
4473 break;
4474 }
4475
4476 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4477 continue;
4478
4479 if (physcpu >= cg_cpu_usage_size)
4480 continue;
4481
4482 curcpu ++;
4483 cpu_cnt ++;
4484
4485 if (!cpu_in_cpuset(physcpu, cpuset)) {
4486 for (i = curcpu; i <= physcpu; i++) {
4487 cg_cpu_usage[i].online = false;
4488 }
4489 continue;
4490 }
4491
4492 if (curcpu < physcpu) {
4493 /* Some CPUs may be disabled */
4494 for (i = curcpu; i < physcpu; i++)
4495 cg_cpu_usage[i].online = false;
4496
4497 curcpu = physcpu;
4498 }
4499
4500 cg_cpu_usage[curcpu].online = true;
4501
4502 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4503 &user,
4504 &nice,
4505 &system,
4506 &idle,
4507 &iowait,
4508 &irq,
4509 &softirq,
4510 &steal,
4511 &guest,
4512 &guest_nice);
4513
4514 if (ret != 10)
4515 continue;
4516
4517 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4518 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4519
4520 if (all_used >= cg_used) {
4521 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4522
4523 } else {
4524 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4525 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4526 curcpu, cg, all_used, cg_used);
4527 cg_cpu_usage[curcpu].idle = idle;
4528 }
4529 }
4530
4531 /* Cannot use more CPUs than is available due to cpuset */
4532 if (max_cpus > cpu_cnt)
4533 max_cpus = cpu_cnt;
4534
4535 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4536
4537 if (!stat_node) {
4538 lxcfs_error("unable to find/create stat node for %s\n", cg);
4539 rv = 0;
4540 goto err;
4541 }
4542
4543 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4544 if (!diff) {
4545 rv = 0;
4546 goto err;
4547 }
4548
4549 /*
4550 * If the new values are LOWER than values stored in memory, it means
4551 * the cgroup has been reset/recreated and we should reset too.
4552 */
4553 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4554 if (!cg_cpu_usage[curcpu].online)
4555 continue;
4556
4557 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4558 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4559
4560 break;
4561 }
4562
4563 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4564
4565 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4566 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4567
4568 if (!stat_node->usage[curcpu].online)
4569 continue;
4570
4571 i++;
4572
4573 stat_node->usage[curcpu].user += diff[curcpu].user;
4574 stat_node->usage[curcpu].system += diff[curcpu].system;
4575 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4576
4577 if (max_cpus > 0 && i >= max_cpus) {
4578 user_surplus += diff[curcpu].user;
4579 system_surplus += diff[curcpu].system;
4580 }
4581 }
4582
4583 /* Calculate usage counters of visible CPUs */
4584 if (max_cpus > 0) {
4585 /* threshold = maximum usage per cpu, including idle */
4586 threshold = total_sum / cpu_cnt * max_cpus;
4587
4588 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4589 if (i == max_cpus)
4590 break;
4591
4592 if (!stat_node->usage[curcpu].online)
4593 continue;
4594
4595 i++;
4596
4597 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4598 continue;
4599
4600 /* Add user */
4601 add_cpu_usage(
4602 &user_surplus,
4603 &diff[curcpu],
4604 &diff[curcpu].user,
4605 threshold);
4606
4607 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4608 continue;
4609
4610 /* If there is still room, add system */
4611 add_cpu_usage(
4612 &system_surplus,
4613 &diff[curcpu],
4614 &diff[curcpu].system,
4615 threshold);
4616 }
4617
4618 if (user_surplus > 0)
4619 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4620 if (system_surplus > 0)
4621 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4622
4623 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4624 if (i == max_cpus)
4625 break;
4626
4627 if (!stat_node->usage[curcpu].online)
4628 continue;
4629
4630 i++;
4631
4632 stat_node->view[curcpu].user += diff[curcpu].user;
4633 stat_node->view[curcpu].system += diff[curcpu].system;
4634 stat_node->view[curcpu].idle += diff[curcpu].idle;
4635
4636 user_sum += stat_node->view[curcpu].user;
4637 system_sum += stat_node->view[curcpu].system;
4638 idle_sum += stat_node->view[curcpu].idle;
4639 }
4640
4641 } else {
4642 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4643 if (!stat_node->usage[curcpu].online)
4644 continue;
4645
4646 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4647 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4648 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4649
4650 user_sum += stat_node->view[curcpu].user;
4651 system_sum += stat_node->view[curcpu].system;
4652 idle_sum += stat_node->view[curcpu].idle;
4653 }
4654 }
4655
4656 /* Render the file */
4657 /* cpu-all */
4658 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4659 user_sum,
4660 system_sum,
4661 idle_sum);
4662
4663 if (l < 0) {
4664 perror("Error writing to cache");
4665 rv = 0;
4666 goto err;
4667
4668 }
4669 if (l >= buf_size) {
4670 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4671 rv = 0;
4672 goto err;
4673 }
4674
4675 buf += l;
4676 buf_size -= l;
4677 total_len += l;
4678
4679 /* Render visible CPUs */
4680 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4681 if (!stat_node->usage[curcpu].online)
4682 continue;
4683
4684 i++;
4685
4686 if (max_cpus > 0 && i == max_cpus)
4687 break;
4688
4689 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4690 i,
4691 stat_node->view[curcpu].user,
4692 stat_node->view[curcpu].system,
4693 stat_node->view[curcpu].idle);
4694
4695 if (l < 0) {
4696 perror("Error writing to cache");
4697 rv = 0;
4698 goto err;
4699
4700 }
4701 if (l >= buf_size) {
4702 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4703 rv = 0;
4704 goto err;
4705 }
4706
4707 buf += l;
4708 buf_size -= l;
4709 total_len += l;
4710 }
4711
4712 /* Pass the rest of /proc/stat, start with the last line read */
4713 l = snprintf(buf, buf_size, "%s", line);
4714
4715 if (l < 0) {
4716 perror("Error writing to cache");
4717 rv = 0;
4718 goto err;
4719
4720 }
4721 if (l >= buf_size) {
4722 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4723 rv = 0;
4724 goto err;
4725 }
4726
4727 buf += l;
4728 buf_size -= l;
4729 total_len += l;
4730
4731 /* Pass the rest of the host's /proc/stat */
4732 while (getline(&line, &linelen, f) != -1) {
4733 l = snprintf(buf, buf_size, "%s", line);
4734 if (l < 0) {
4735 perror("Error writing to cache");
4736 rv = 0;
4737 goto err;
4738 }
4739 if (l >= buf_size) {
4740 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4741 rv = 0;
4742 goto err;
4743 }
4744 buf += l;
4745 buf_size -= l;
4746 total_len += l;
4747 }
4748
4749 rv = total_len;
4750
4751 err:
4752 if (stat_node)
4753 pthread_mutex_unlock(&stat_node->lock);
4754 if (line)
4755 free(line);
4756 if (diff)
4757 free(diff);
4758 return rv;
4759 }
4760
4761 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4762 static int proc_stat_read(char *buf, size_t size, off_t offset,
4763 struct fuse_file_info *fi)
4764 {
4765 struct fuse_context *fc = fuse_get_context();
4766 struct file_info *d = (struct file_info *)fi->fh;
4767 char *cg;
4768 char *cpuset = NULL;
4769 char *line = NULL;
4770 size_t linelen = 0, total_len = 0, rv = 0;
4771 int curcpu = -1; /* cpu numbering starts at 0 */
4772 int physcpu = 0;
4773 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4774 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4775 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4776 char cpuall[CPUALL_MAX_SIZE];
4777 /* reserve for cpu all */
4778 char *cache = d->buf + CPUALL_MAX_SIZE;
4779 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4780 FILE *f = NULL;
4781 struct cpuacct_usage *cg_cpu_usage = NULL;
4782 int cg_cpu_usage_size = 0;
4783
4784 if (offset){
4785 if (offset > d->size)
4786 return -EINVAL;
4787 if (!d->cached)
4788 return 0;
4789 int left = d->size - offset;
4790 total_len = left > size ? size: left;
4791 memcpy(buf, d->buf + offset, total_len);
4792 return total_len;
4793 }
4794
4795 pid_t initpid = lookup_initpid_in_store(fc->pid);
4796 if (initpid <= 0)
4797 initpid = fc->pid;
4798 cg = get_pid_cgroup(initpid, "cpuset");
4799 if (!cg)
4800 return read_file("/proc/stat", buf, size, d);
4801 prune_init_slice(cg);
4802
4803 cpuset = get_cpuset(cg);
4804 if (!cpuset)
4805 goto err;
4806
4807 /*
4808 * Read cpuacct.usage_all for all CPUs.
4809 * If the cpuacct cgroup is present, it is used to calculate the container's
4810 * CPU usage. If not, values from the host's /proc/stat are used.
4811 */
4812 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4813 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4814 "falling back to the host's /proc/stat");
4815 }
4816
4817 f = fopen("/proc/stat", "r");
4818 if (!f)
4819 goto err;
4820
4821 //skip first line
4822 if (getline(&line, &linelen, f) < 0) {
4823 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4824 goto err;
4825 }
4826
4827 if (use_cpuview(cg) && cg_cpu_usage) {
4828 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4829 f, d->buf, d->buflen);
4830 goto out;
4831 }
4832
4833 while (getline(&line, &linelen, f) != -1) {
4834 ssize_t l;
4835 char cpu_char[10]; /* That's a lot of cores */
4836 char *c;
4837 uint64_t all_used, cg_used, new_idle;
4838 int ret;
4839
4840 if (strlen(line) == 0)
4841 continue;
4842 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4843 /* not a ^cpuN line containing a number N, just print it */
4844 l = snprintf(cache, cache_size, "%s", line);
4845 if (l < 0) {
4846 perror("Error writing to cache");
4847 rv = 0;
4848 goto err;
4849 }
4850 if (l >= cache_size) {
4851 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4852 rv = 0;
4853 goto err;
4854 }
4855 cache += l;
4856 cache_size -= l;
4857 total_len += l;
4858 continue;
4859 }
4860
4861 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4862 continue;
4863 if (!cpu_in_cpuset(physcpu, cpuset))
4864 continue;
4865 curcpu ++;
4866
4867 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4868 &user,
4869 &nice,
4870 &system,
4871 &idle,
4872 &iowait,
4873 &irq,
4874 &softirq,
4875 &steal,
4876 &guest,
4877 &guest_nice);
4878
4879 if (ret != 10 || !cg_cpu_usage) {
4880 c = strchr(line, ' ');
4881 if (!c)
4882 continue;
4883 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4884 if (l < 0) {
4885 perror("Error writing to cache");
4886 rv = 0;
4887 goto err;
4888
4889 }
4890 if (l >= cache_size) {
4891 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4892 rv = 0;
4893 goto err;
4894 }
4895
4896 cache += l;
4897 cache_size -= l;
4898 total_len += l;
4899
4900 if (ret != 10)
4901 continue;
4902 }
4903
4904 if (cg_cpu_usage) {
4905 if (physcpu >= cg_cpu_usage_size)
4906 break;
4907
4908 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4909 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4910
4911 if (all_used >= cg_used) {
4912 new_idle = idle + (all_used - cg_used);
4913
4914 } else {
4915 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4916 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4917 curcpu, cg, all_used, cg_used);
4918 new_idle = idle;
4919 }
4920
4921 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4922 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4923 new_idle);
4924
4925 if (l < 0) {
4926 perror("Error writing to cache");
4927 rv = 0;
4928 goto err;
4929
4930 }
4931 if (l >= cache_size) {
4932 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4933 rv = 0;
4934 goto err;
4935 }
4936
4937 cache += l;
4938 cache_size -= l;
4939 total_len += l;
4940
4941 user_sum += cg_cpu_usage[physcpu].user;
4942 system_sum += cg_cpu_usage[physcpu].system;
4943 idle_sum += new_idle;
4944
4945 } else {
4946 user_sum += user;
4947 nice_sum += nice;
4948 system_sum += system;
4949 idle_sum += idle;
4950 iowait_sum += iowait;
4951 irq_sum += irq;
4952 softirq_sum += softirq;
4953 steal_sum += steal;
4954 guest_sum += guest;
4955 guest_nice_sum += guest_nice;
4956 }
4957 }
4958
4959 cache = d->buf;
4960
4961 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4962 user_sum,
4963 nice_sum,
4964 system_sum,
4965 idle_sum,
4966 iowait_sum,
4967 irq_sum,
4968 softirq_sum,
4969 steal_sum,
4970 guest_sum,
4971 guest_nice_sum);
4972 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4973 memcpy(cache, cpuall, cpuall_len);
4974 cache += cpuall_len;
4975 } else {
4976 /* shouldn't happen */
4977 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4978 cpuall_len = 0;
4979 }
4980
4981 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4982 total_len += cpuall_len;
4983
4984 out:
4985 d->cached = 1;
4986 d->size = total_len;
4987 if (total_len > size)
4988 total_len = size;
4989
4990 memcpy(buf, d->buf, total_len);
4991 rv = total_len;
4992
4993 err:
4994 if (f)
4995 fclose(f);
4996 if (cg_cpu_usage)
4997 free(cg_cpu_usage);
4998 free(line);
4999 free(cpuset);
5000 free(cg);
5001 return rv;
5002 }
5003
/*
 * Return the busy time, in whole seconds, of the group of tasks that the
 * reaper (init process) of @task belongs to.
 *
 * The value is taken from cpuacct.usage of the reaper's cpuacct cgroup and
 * divided by 10^9. Unfortunately, this only makes sense when the container
 * has been given its own cpuacct cgroup. If not, the busy time of all other
 * tasks that do not actually belong to the container is taken into account
 * as well. If someone has a clever solution for this please send a patch!
 *
 * Returns 0 when the reaper, its cgroup or the usage value cannot be looked
 * up.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *usage_str = NULL;
	char *cgroup;
	unsigned long busy = 0;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup) {
		prune_init_slice(cgroup);
		if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
			busy = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(usage_str);
	free(cgroup);
	return busy;
}
5034
#if RELOADTEST
/*
 * Reload-test marker: create (or truncate) /tmp/lxcfs-iwashere so a test can
 * observe that this code path was executed. Failure to create the file is
 * silently ignored.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
5045
5046 /*
5047 * We read /proc/uptime and reuse its second field.
5048 * For the first field, we use the mtime for the reaper for
5049 * the calling pid as returned by getreaperage
5050 */
5051 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5052 struct fuse_file_info *fi)
5053 {
5054 struct fuse_context *fc = fuse_get_context();
5055 struct file_info *d = (struct file_info *)fi->fh;
5056 unsigned long int busytime = get_reaper_busy(fc->pid);
5057 char *cache = d->buf;
5058 ssize_t total_len = 0;
5059 uint64_t idletime, reaperage;
5060
5061 #if RELOADTEST
5062 iwashere();
5063 #endif
5064
5065 if (offset){
5066 if (!d->cached)
5067 return 0;
5068 if (offset > d->size)
5069 return -EINVAL;
5070 int left = d->size - offset;
5071 total_len = left > size ? size: left;
5072 memcpy(buf, cache + offset, total_len);
5073 return total_len;
5074 }
5075
5076 reaperage = get_reaper_age(fc->pid);
5077 /* To understand why this is done, please read the comment to the
5078 * get_reaper_busy() function.
5079 */
5080 idletime = reaperage;
5081 if (reaperage >= busytime)
5082 idletime = reaperage - busytime;
5083
5084 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5085 if (total_len < 0 || total_len >= d->buflen){
5086 lxcfs_error("%s\n", "failed to write to cache");
5087 return 0;
5088 }
5089
5090 d->size = (int)total_len;
5091 d->cached = 1;
5092
5093 if (total_len > size) total_len = size;
5094
5095 memcpy(buf, d->buf, total_len);
5096 return total_len;
5097 }
5098
5099 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5100 struct fuse_file_info *fi)
5101 {
5102 char dev_name[72];
5103 struct fuse_context *fc = fuse_get_context();
5104 struct file_info *d = (struct file_info *)fi->fh;
5105 char *cg;
5106 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5107 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5108 unsigned long read = 0, write = 0;
5109 unsigned long read_merged = 0, write_merged = 0;
5110 unsigned long read_sectors = 0, write_sectors = 0;
5111 unsigned long read_ticks = 0, write_ticks = 0;
5112 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5113 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5114 char *cache = d->buf;
5115 size_t cache_size = d->buflen;
5116 char *line = NULL;
5117 size_t linelen = 0, total_len = 0, rv = 0;
5118 unsigned int major = 0, minor = 0;
5119 int i = 0;
5120 FILE *f = NULL;
5121
5122 if (offset){
5123 if (offset > d->size)
5124 return -EINVAL;
5125 if (!d->cached)
5126 return 0;
5127 int left = d->size - offset;
5128 total_len = left > size ? size: left;
5129 memcpy(buf, cache + offset, total_len);
5130 return total_len;
5131 }
5132
5133 pid_t initpid = lookup_initpid_in_store(fc->pid);
5134 if (initpid <= 0)
5135 initpid = fc->pid;
5136 cg = get_pid_cgroup(initpid, "blkio");
5137 if (!cg)
5138 return read_file("/proc/diskstats", buf, size, d);
5139 prune_init_slice(cg);
5140
5141 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5142 goto err;
5143 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5144 goto err;
5145 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5146 goto err;
5147 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5148 goto err;
5149 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5150 goto err;
5151
5152
5153 f = fopen("/proc/diskstats", "r");
5154 if (!f)
5155 goto err;
5156
5157 while (getline(&line, &linelen, f) != -1) {
5158 ssize_t l;
5159 char lbuf[256];
5160
5161 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5162 if (i != 3)
5163 continue;
5164
5165 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5166 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5167 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5168 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5169 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5170 read_sectors = read_sectors/512;
5171 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5172 write_sectors = write_sectors/512;
5173
5174 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5175 rd_svctm = rd_svctm/1000000;
5176 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5177 rd_wait = rd_wait/1000000;
5178 read_ticks = rd_svctm + rd_wait;
5179
5180 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5181 wr_svctm = wr_svctm/1000000;
5182 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5183 wr_wait = wr_wait/1000000;
5184 write_ticks = wr_svctm + wr_wait;
5185
5186 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5187 tot_ticks = tot_ticks/1000000;
5188
5189 memset(lbuf, 0, 256);
5190 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5191 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5192 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5193 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5194 else
5195 continue;
5196
5197 l = snprintf(cache, cache_size, "%s", lbuf);
5198 if (l < 0) {
5199 perror("Error writing to fuse buf");
5200 rv = 0;
5201 goto err;
5202 }
5203 if (l >= cache_size) {
5204 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5205 rv = 0;
5206 goto err;
5207 }
5208 cache += l;
5209 cache_size -= l;
5210 total_len += l;
5211 }
5212
5213 d->cached = 1;
5214 d->size = total_len;
5215 if (total_len > size ) total_len = size;
5216 memcpy(buf, d->buf, total_len);
5217
5218 rv = total_len;
5219 err:
5220 free(cg);
5221 if (f)
5222 fclose(f);
5223 free(line);
5224 free(io_serviced_str);
5225 free(io_merged_str);
5226 free(io_service_bytes_str);
5227 free(io_wait_time_str);
5228 free(io_service_time_str);
5229 return rv;
5230 }
5231
5232 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5233 struct fuse_file_info *fi)
5234 {
5235 struct fuse_context *fc = fuse_get_context();
5236 struct file_info *d = (struct file_info *)fi->fh;
5237 char *cg = NULL;
5238 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5239 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5240 ssize_t total_len = 0, rv = 0;
5241 ssize_t l = 0;
5242 char *cache = d->buf;
5243
5244 if (offset) {
5245 if (offset > d->size)
5246 return -EINVAL;
5247 if (!d->cached)
5248 return 0;
5249 int left = d->size - offset;
5250 total_len = left > size ? size: left;
5251 memcpy(buf, cache + offset, total_len);
5252 return total_len;
5253 }
5254
5255 pid_t initpid = lookup_initpid_in_store(fc->pid);
5256 if (initpid <= 0)
5257 initpid = fc->pid;
5258 cg = get_pid_cgroup(initpid, "memory");
5259 if (!cg)
5260 return read_file("/proc/swaps", buf, size, d);
5261 prune_init_slice(cg);
5262
5263 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5264
5265 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5266 goto err;
5267
5268 memusage = strtoul(memusage_str, NULL, 10);
5269
5270 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5271 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5272
5273 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5274 memswusage = strtoul(memswusage_str, NULL, 10);
5275
5276 swap_total = (memswlimit - memlimit) / 1024;
5277 swap_free = (memswusage - memusage) / 1024;
5278 }
5279
5280 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5281
5282 /* When no mem + swap limit is specified or swapaccount=0*/
5283 if (!memswlimit) {
5284 char *line = NULL;
5285 size_t linelen = 0;
5286 FILE *f = fopen("/proc/meminfo", "r");
5287
5288 if (!f)
5289 goto err;
5290
5291 while (getline(&line, &linelen, f) != -1) {
5292 if (startswith(line, "SwapTotal:")) {
5293 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5294 } else if (startswith(line, "SwapFree:")) {
5295 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5296 }
5297 }
5298
5299 free(line);
5300 fclose(f);
5301 }
5302
5303 if (swap_total > 0) {
5304 l = snprintf(d->buf + total_len, d->size - total_len,
5305 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5306 swap_total, swap_free);
5307 total_len += l;
5308 }
5309
5310 if (total_len < 0 || l < 0) {
5311 perror("Error writing to cache");
5312 rv = 0;
5313 goto err;
5314 }
5315
5316 d->cached = 1;
5317 d->size = (int)total_len;
5318
5319 if (total_len > size) total_len = size;
5320 memcpy(buf, d->buf, total_len);
5321 rv = total_len;
5322
5323 err:
5324 free(cg);
5325 free(memswlimit_str);
5326 free(memlimit_str);
5327 free(memusage_str);
5328 free(memswusage_str);
5329 return rv;
5330 }
/*
 * Collect the pids listed in the cgroup.procs files of a cgroup and of its
 * child cgroups.
 * eg: from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs
 *
 * @pid_buf : array of heap-allocated strings, grown with realloc(); every
 *            line read from cgroup.procs (including its trailing newline) is
 *            appended as a new element. The caller owns the array and its
 *            elements.
 * @dpath   : the path of the cgroup, relative to @cfd.
 *            eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many levels of child cgroups to descend into; 0 reads only
 *            @dpath itself.
 * @sum     : number of entries already stored in @pid_buf.
 * @cfd     : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Child cgroups are visited before @dpath's own cgroup.procs is read. Only
 * the "." and ".." directory entries are skipped, so child cgroups whose
 * name merely starts with a dot are included.
 *
 * Returns the new number of entries in @pid_buf. If @dpath or its
 * cgroup.procs cannot be opened, whatever was collected so far is returned.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + \0 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (depth > 0 && (file = readdir(dir)) != NULL) {
		/* Skip exactly the self and parent entries. */
		if (strcmp(file->d_name, ".") == 0 ||
		    strcmp(file->d_name, "..") == 0)
			continue;
		if (file->d_type != DT_DIR)
			continue;

		/* path + '/' + d_name + \0 */
		do {
			path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
		} while (!path_dir);
		strcpy(path_dir, path);
		strcat(path_dir, "/");
		strcat(path_dir, file->d_name);

		sum = calc_pid(pid_buf, path_dir, depth - 1, sum, cfd);
		free(path_dir);
	}
	closedir(dir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		/* Grow the array by one slot and store a copy of the line. */
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	free(line);
	free(path);
	return sum;
}
5417 /*
5418 * calc_load calculates the load according to the following formula:
5419 * load1 = load0 * exp + active * (1 - exp)
5420 *
5421 * @load1: the new loadavg.
5422 * @load0: the former loadavg.
5423 * @active: the total number of running pid at this moment.
5424 * @exp: the fixed-point defined in the beginning.
5425 */
5426 static unsigned long
5427 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5428 {
5429 unsigned long newload;
5430
5431 active = active > 0 ? active * FIXED_1 : 0;
5432 newload = load * exp + active * (FIXED_1 - exp);
5433 if (active >= load)
5434 newload += FIXED_1 - 1;
5435
5436 return newload / FIXED_1;
5437 }
5438
5439 /*
5440 * Return 0 means that container p->cg is closed.
5441 * Return -1 means that error occurred in refresh.
5442 * Positive num equals the total number of pid.
5443 */
5444 static int refresh_load(struct load_node *p, char *path)
5445 {
5446 FILE *f = NULL;
5447 char **idbuf;
5448 char proc_path[256];
5449 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5450 char *line = NULL;
5451 size_t linelen = 0;
5452 int sum, length;
5453 DIR *dp;
5454 struct dirent *file;
5455
5456 do {
5457 idbuf = malloc(sizeof(char *));
5458 } while (!idbuf);
5459 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5460 /* normal exit */
5461 if (sum == 0)
5462 goto out;
5463
5464 for (i = 0; i < sum; i++) {
5465 /*clean up '\n' */
5466 length = strlen(idbuf[i])-1;
5467 idbuf[i][length] = '\0';
5468 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5469 if (ret < 0 || ret > 255) {
5470 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5471 i = sum;
5472 sum = -1;
5473 goto err_out;
5474 }
5475
5476 dp = opendir(proc_path);
5477 if (!dp) {
5478 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5479 continue;
5480 }
5481 while ((file = readdir(dp)) != NULL) {
5482 if (strncmp(file->d_name, ".", 1) == 0)
5483 continue;
5484 if (strncmp(file->d_name, "..", 1) == 0)
5485 continue;
5486 total_pid++;
5487 /* We make the biggest pid become last_pid.*/
5488 ret = atof(file->d_name);
5489 last_pid = (ret > last_pid) ? ret : last_pid;
5490
5491 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5492 if (ret < 0 || ret > 255) {
5493 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5494 i = sum;
5495 sum = -1;
5496 closedir(dp);
5497 goto err_out;
5498 }
5499 f = fopen(proc_path, "r");
5500 if (f != NULL) {
5501 while (getline(&line, &linelen, f) != -1) {
5502 /* Find State */
5503 if ((line[0] == 'S') && (line[1] == 't'))
5504 break;
5505 }
5506 if ((line[7] == 'R') || (line[7] == 'D'))
5507 run_pid++;
5508 fclose(f);
5509 }
5510 }
5511 closedir(dp);
5512 }
5513 /*Calculate the loadavg.*/
5514 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5515 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5516 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5517 p->run_pid = run_pid;
5518 p->total_pid = total_pid;
5519 p->last_pid = last_pid;
5520
5521 free(line);
5522 err_out:
5523 for (; i > 0; i--)
5524 free(idbuf[i-1]);
5525 out:
5526 free(idbuf);
5527 return sum;
5528 }
5529 /*
5530 * Traverse the hash table and update it.
5531 */
5532 void *load_begin(void *arg)
5533 {
5534
5535 char *path = NULL;
5536 int i, sum, length, ret;
5537 struct load_node *f;
5538 int first_node;
5539 clock_t time1, time2;
5540
5541 while (1) {
5542 if (loadavg_stop == 1)
5543 return NULL;
5544
5545 time1 = clock();
5546 for (i = 0; i < LOAD_SIZE; i++) {
5547 pthread_mutex_lock(&load_hash[i].lock);
5548 if (load_hash[i].next == NULL) {
5549 pthread_mutex_unlock(&load_hash[i].lock);
5550 continue;
5551 }
5552 f = load_hash[i].next;
5553 first_node = 1;
5554 while (f) {
5555 length = strlen(f->cg) + 2;
5556 do {
5557 /* strlen(f->cg) + '.' or '' + \0 */
5558 path = malloc(length);
5559 } while (!path);
5560
5561 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5562 if (ret < 0 || ret > length - 1) {
5563 /* snprintf failed, ignore the node.*/
5564 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5565 goto out;
5566 }
5567 sum = refresh_load(f, path);
5568 if (sum == 0) {
5569 f = del_node(f, i);
5570 } else {
5571 out: f = f->next;
5572 }
5573 free(path);
5574 /* load_hash[i].lock locks only on the first node.*/
5575 if (first_node == 1) {
5576 first_node = 0;
5577 pthread_mutex_unlock(&load_hash[i].lock);
5578 }
5579 }
5580 }
5581
5582 if (loadavg_stop == 1)
5583 return NULL;
5584
5585 time2 = clock();
5586 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5587 }
5588 }
5589
5590 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5591 struct fuse_file_info *fi)
5592 {
5593 struct fuse_context *fc = fuse_get_context();
5594 struct file_info *d = (struct file_info *)fi->fh;
5595 pid_t initpid;
5596 char *cg;
5597 size_t total_len = 0;
5598 char *cache = d->buf;
5599 struct load_node *n;
5600 int hash;
5601 int cfd, rv = 0;
5602 unsigned long a, b, c;
5603
5604 if (offset) {
5605 if (offset > d->size)
5606 return -EINVAL;
5607 if (!d->cached)
5608 return 0;
5609 int left = d->size - offset;
5610 total_len = left > size ? size : left;
5611 memcpy(buf, cache + offset, total_len);
5612 return total_len;
5613 }
5614 if (!loadavg)
5615 return read_file("/proc/loadavg", buf, size, d);
5616
5617 initpid = lookup_initpid_in_store(fc->pid);
5618 if (initpid <= 0)
5619 initpid = fc->pid;
5620 cg = get_pid_cgroup(initpid, "cpu");
5621 if (!cg)
5622 return read_file("/proc/loadavg", buf, size, d);
5623
5624 prune_init_slice(cg);
5625 hash = calc_hash(cg) % LOAD_SIZE;
5626 n = locate_node(cg, hash);
5627
5628 /* First time */
5629 if (n == NULL) {
5630 if (!find_mounted_controller("cpu", &cfd)) {
5631 /*
5632 * In locate_node() above, pthread_rwlock_unlock() isn't used
5633 * because delete is not allowed before read has ended.
5634 */
5635 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5636 rv = 0;
5637 goto err;
5638 }
5639 do {
5640 n = malloc(sizeof(struct load_node));
5641 } while (!n);
5642
5643 do {
5644 n->cg = malloc(strlen(cg)+1);
5645 } while (!n->cg);
5646 strcpy(n->cg, cg);
5647 n->avenrun[0] = 0;
5648 n->avenrun[1] = 0;
5649 n->avenrun[2] = 0;
5650 n->run_pid = 0;
5651 n->total_pid = 1;
5652 n->last_pid = initpid;
5653 n->cfd = cfd;
5654 insert_node(&n, hash);
5655 }
5656 a = n->avenrun[0] + (FIXED_1/200);
5657 b = n->avenrun[1] + (FIXED_1/200);
5658 c = n->avenrun[2] + (FIXED_1/200);
5659 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5660 LOAD_INT(a), LOAD_FRAC(a),
5661 LOAD_INT(b), LOAD_FRAC(b),
5662 LOAD_INT(c), LOAD_FRAC(c),
5663 n->run_pid, n->total_pid, n->last_pid);
5664 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5665 if (total_len < 0 || total_len >= d->buflen) {
5666 lxcfs_error("%s\n", "Failed to write to cache");
5667 rv = 0;
5668 goto err;
5669 }
5670 d->size = (int)total_len;
5671 d->cached = 1;
5672
5673 if (total_len > size)
5674 total_len = size;
5675 memcpy(buf, d->buf, total_len);
5676 rv = total_len;
5677
5678 err:
5679 free(cg);
5680 return rv;
5681 }
5682 /* Return a positive number on success, return 0 on failure.*/
5683 pthread_t load_daemon(int load_use)
5684 {
5685 int ret;
5686 pthread_t pid;
5687
5688 ret = init_load();
5689 if (ret == -1) {
5690 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5691 return 0;
5692 }
5693 ret = pthread_create(&pid, NULL, load_begin, NULL);
5694 if (ret != 0) {
5695 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5696 load_free();
5697 return 0;
5698 }
5699 /* use loadavg, here loadavg = 1*/
5700 loadavg = load_use;
5701 return pid;
5702 }
5703
5704 /* Returns 0 on success. */
5705 int stop_load_daemon(pthread_t pid)
5706 {
5707 int s;
5708
5709 /* Signal the thread to gracefully stop */
5710 loadavg_stop = 1;
5711
5712 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5713 if (s != 0) {
5714 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5715 return -1;
5716 }
5717
5718 load_free();
5719 loadavg_stop = 0;
5720
5721 return 0;
5722 }
5723
/*
 * Return the number of bytes in the file @which, determined by reading it
 * line by line and summing the line lengths (procfs files do not report a
 * usable size through stat).
 *
 * Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t nread;
	ssize_t total = 0;
	FILE *f;

	f = fopen(which, "r");
	if (f == NULL)
		return 0;

	for (;;) {
		nread = getline(&line, &cap, f);
		if (nread == -1)
			break;
		total += nread;
	}

	free(line);
	fclose(f);
	return total;
}
5740
/*
 * FUSE getattr handler for the /proc subtree.
 *
 * @path : absolute path inside the lxcfs mount.
 * @sb   : stat buffer to fill; it is zeroed first.
 *
 * "/proc" is reported as a directory (mode 0555, two links) and each of the
 * virtualised files as a read-only regular file (mode 0444, one link) of
 * size 0. Owner and group are root and all timestamps are the current time.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be obtained and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(proc_files) / sizeof(proc_files[0]); idx++) {
		if (strcmp(path, proc_files[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
5770
5771 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5772 struct fuse_file_info *fi)
5773 {
5774 if (filler(buf, ".", NULL, 0) != 0 ||
5775 filler(buf, "..", NULL, 0) != 0 ||
5776 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5777 filler(buf, "meminfo", NULL, 0) != 0 ||
5778 filler(buf, "stat", NULL, 0) != 0 ||
5779 filler(buf, "uptime", NULL, 0) != 0 ||
5780 filler(buf, "diskstats", NULL, 0) != 0 ||
5781 filler(buf, "swaps", NULL, 0) != 0 ||
5782 filler(buf, "loadavg", NULL, 0) != 0)
5783 return -EINVAL;
5784 return 0;
5785 }
5786
5787 int proc_open(const char *path, struct fuse_file_info *fi)
5788 {
5789 int type = -1;
5790 struct file_info *info;
5791
5792 if (strcmp(path, "/proc/meminfo") == 0)
5793 type = LXC_TYPE_PROC_MEMINFO;
5794 else if (strcmp(path, "/proc/cpuinfo") == 0)
5795 type = LXC_TYPE_PROC_CPUINFO;
5796 else if (strcmp(path, "/proc/uptime") == 0)
5797 type = LXC_TYPE_PROC_UPTIME;
5798 else if (strcmp(path, "/proc/stat") == 0)
5799 type = LXC_TYPE_PROC_STAT;
5800 else if (strcmp(path, "/proc/diskstats") == 0)
5801 type = LXC_TYPE_PROC_DISKSTATS;
5802 else if (strcmp(path, "/proc/swaps") == 0)
5803 type = LXC_TYPE_PROC_SWAPS;
5804 else if (strcmp(path, "/proc/loadavg") == 0)
5805 type = LXC_TYPE_PROC_LOADAVG;
5806 if (type == -1)
5807 return -ENOENT;
5808
5809 info = malloc(sizeof(*info));
5810 if (!info)
5811 return -ENOMEM;
5812
5813 memset(info, 0, sizeof(*info));
5814 info->type = type;
5815
5816 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5817 do {
5818 info->buf = malloc(info->buflen);
5819 } while (!info->buf);
5820 memset(info->buf, 0, info->buflen);
5821 /* set actual size to buffer size */
5822 info->size = info->buflen;
5823
5824 fi->fh = (unsigned long)info;
5825 return 0;
5826 }
5827
/*
 * FUSE access handler for the /proc subtree.
 *
 * "/proc" itself is granted whenever the process can read @path via
 * access(2), regardless of @mask. Everything else is read-only: any bit in
 * @mask other than R_OK is refused.
 *
 * Returns 0 if access is granted and -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0) {
		if (access(path, R_OK) == 0)
			return 0;
	}

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
5838
/*
 * FUSE release handler for the /proc files: hands @fi to
 * do_release_file_info() and always reports success. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
5844
5845 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5846 struct fuse_file_info *fi)
5847 {
5848 struct file_info *f = (struct file_info *) fi->fh;
5849
5850 switch (f->type) {
5851 case LXC_TYPE_PROC_MEMINFO:
5852 return proc_meminfo_read(buf, size, offset, fi);
5853 case LXC_TYPE_PROC_CPUINFO:
5854 return proc_cpuinfo_read(buf, size, offset, fi);
5855 case LXC_TYPE_PROC_UPTIME:
5856 return proc_uptime_read(buf, size, offset, fi);
5857 case LXC_TYPE_PROC_STAT:
5858 return proc_stat_read(buf, size, offset, fi);
5859 case LXC_TYPE_PROC_DISKSTATS:
5860 return proc_diskstats_read(buf, size, offset, fi);
5861 case LXC_TYPE_PROC_SWAPS:
5862 return proc_swaps_read(buf, size, offset, fi);
5863 case LXC_TYPE_PROC_LOADAVG:
5864 return proc_loadavg_read(buf, size, offset, fi);
5865 default:
5866 return -EINVAL;
5867 }
5868 }
5869
/*
 * Functions needed to set up cgroups in the __constructor__.
 */
5873
/*
 * Create @dir and all of its missing parent directories with @mode,
 * similar to `mkdir -p`.
 *
 * The path is walked one component at a time and mkdir(2) is called on each
 * successively longer prefix; prefixes that already exist (EEXIST) are fine.
 *
 * Returns true on success. Returns false if a prefix cannot be duplicated
 * or if mkdir(2) fails with anything other than EEXIST (the failure is
 * logged).
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *start = dir;	/* beginning of the whole path */
	const char *comp;		/* first character of the current component */
	const char *comp_end = dir;	/* one past the current component */
	char *prefix;

	do {
		comp = comp_end + strspn(comp_end, "/");
		comp_end = comp + strcspn(comp, "/");

		/* Everything up to (not including) the current component. */
		prefix = strndup(start, comp - start);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);
	} while (comp_end != comp);

	return true;
}
5897
5898 static bool umount_if_mounted(void)
5899 {
5900 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5901 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5902 return false;
5903 }
5904 return true;
5905 }
5906
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
5913
/*
 * Report whether "/" is the kernel's initial rootfs.
 *
 * Looking at fs/proc_namespace.c, it appears we can actually expect the
 * rootfs entry to very specifically contain
 *	" - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root, the rootfs
 * entry should always be skipped in mountinfo contents.
 *
 * Each line of /proc/self/mountinfo is examined: the fifth space-separated
 * field is the mount point, and for the entry mounted on "/" the text
 * starting at the first '-' after it must begin with "- rootfs rootfs ".
 *
 * Returns true if such an entry exists, false otherwise (including when the
 * file cannot be opened).
 */
static bool is_on_ramfs(void)
{
	FILE *f;
	char *line = NULL;
	size_t cap = 0;
	bool found = false;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!found && getline(&line, &cap, f) != -1) {
		char *mnt = line;
		char *mnt_end, *sep;
		int field;

		/* Advance to the space in front of the mount point field. */
		for (field = 0; mnt && field < 4; field++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		// this is '/'. is it the ramfs?
		sep = strchr(mnt_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(f);
	return found;
}
5956
5957 static int pivot_enter()
5958 {
5959 int ret = -1, oldroot = -1, newroot = -1;
5960
5961 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5962 if (oldroot < 0) {
5963 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5964 return ret;
5965 }
5966
5967 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5968 if (newroot < 0) {
5969 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5970 goto err;
5971 }
5972
5973 /* change into new root fs */
5974 if (fchdir(newroot) < 0) {
5975 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5976 goto err;
5977 }
5978
5979 /* pivot_root into our new root fs */
5980 if (pivot_root(".", ".") < 0) {
5981 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5982 goto err;
5983 }
5984
5985 /*
5986 * At this point the old-root is mounted on top of our new-root.
5987 * To unmounted it we must not be chdir'd into it, so escape back
5988 * to the old-root.
5989 */
5990 if (fchdir(oldroot) < 0) {
5991 lxcfs_error("%s\n", "Failed to enter old root.");
5992 goto err;
5993 }
5994
5995 if (umount2(".", MNT_DETACH) < 0) {
5996 lxcfs_error("%s\n", "Failed to detach old root.");
5997 goto err;
5998 }
5999
6000 if (fchdir(newroot) < 0) {
6001 lxcfs_error("%s\n", "Failed to re-enter new root.");
6002 goto err;
6003 }
6004
6005 ret = 0;
6006
6007 err:
6008 if (oldroot > 0)
6009 close(oldroot);
6010 if (newroot > 0)
6011 close(newroot);
6012
6013 return ret;
6014 }
6015
6016 static int chroot_enter()
6017 {
6018 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6019 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6020 return -1;
6021 }
6022
6023 if (chroot(".") < 0) {
6024 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6025 return -1;
6026 }
6027
6028 if (chdir("/") < 0) {
6029 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6030 return -1;
6031 }
6032
6033 return 0;
6034 }
6035
6036 static int permute_and_enter(void)
6037 {
6038 struct statfs sb;
6039
6040 if (statfs("/", &sb) < 0) {
6041 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6042 return -1;
6043 }
6044
6045 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6046 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6047 * /proc/1/mountinfo. */
6048 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6049 return chroot_enter();
6050
6051 if (pivot_enter() < 0) {
6052 lxcfs_error("%s\n", "Could not perform pivot root.");
6053 return -1;
6054 }
6055
6056 return 0;
6057 }
6058
6059 /* Prepare our new clean root. */
6060 static int permute_prepare(void)
6061 {
6062 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6063 lxcfs_error("%s\n", "Failed to create directory for new root.");
6064 return -1;
6065 }
6066
6067 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6068 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6069 return -1;
6070 }
6071
6072 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6073 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6074 return -1;
6075 }
6076
6077 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6078 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6079 return -1;
6080 }
6081
6082 return 0;
6083 }
6084
/*
 * Build the new root and switch into it: chroot() on ramfs, pivot_root()
 * in all other cases. Returns true only if both the preparation and the
 * switch succeeded; the switch is not attempted when preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6098
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 *
 * Returns the file descriptor returned by open() (negative on failure), or
 * -1 if the path could not be formatted into the local buffer.
 */
static int preserve_mnt_ns(int pid)
{
	/* "/proc/" + up to 21 characters for the pid + "/ns/mnt"; both
	 * sizeof() terms include a terminating NUL. */
	char nspath[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(nspath))
		return -1;

	return open(nspath, O_RDONLY | O_CLOEXEC);
}
6111
6112 static bool cgfs_prepare_mounts(void)
6113 {
6114 if (!mkdir_p(BASEDIR, 0700)) {
6115 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6116 return false;
6117 }
6118
6119 if (!umount_if_mounted()) {
6120 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6121 return false;
6122 }
6123
6124 if (unshare(CLONE_NEWNS) < 0) {
6125 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6126 return false;
6127 }
6128
6129 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6130 if (cgroup_mount_ns_fd < 0) {
6131 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6132 return false;
6133 }
6134
6135 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6136 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6137 return false;
6138 }
6139
6140 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6141 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6142 return false;
6143 }
6144
6145 return true;
6146 }
6147
6148 static bool cgfs_mount_hierarchies(void)
6149 {
6150 char *target;
6151 size_t clen, len;
6152 int i, ret;
6153
6154 for (i = 0; i < num_hierarchies; i++) {
6155 char *controller = hierarchies[i];
6156
6157 clen = strlen(controller);
6158 len = strlen(BASEDIR) + clen + 2;
6159 target = malloc(len);
6160 if (!target)
6161 return false;
6162
6163 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6164 if (ret < 0 || ret >= len) {
6165 free(target);
6166 return false;
6167 }
6168 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6169 free(target);
6170 return false;
6171 }
6172 if (!strcmp(controller, "unified"))
6173 ret = mount("none", target, "cgroup2", 0, NULL);
6174 else
6175 ret = mount(controller, target, "cgroup", 0, controller);
6176 if (ret < 0) {
6177 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6178 free(target);
6179 return false;
6180 }
6181
6182 fd_hierarchies[i] = open(target, O_DIRECTORY);
6183 if (fd_hierarchies[i] < 0) {
6184 free(target);
6185 return false;
6186 }
6187 free(target);
6188 }
6189 return true;
6190 }
6191
/*
 * Run the three setup stages in order: prepare the private mount
 * namespace, mount the cgroup hierarchies, then switch to the new root.
 * Stops at the first stage that fails and returns false; returns true when
 * all three succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6207
6208 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6209 {
6210 FILE *f;
6211 char *cret, *line = NULL;
6212 char cwd[MAXPATHLEN];
6213 size_t len = 0;
6214 int i, init_ns = -1;
6215 bool found_unified = false;
6216
6217 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6218 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6219 return;
6220 }
6221
6222 while (getline(&line, &len, f) != -1) {
6223 char *idx, *p, *p2;
6224
6225 p = strchr(line, ':');
6226 if (!p)
6227 goto out;
6228 idx = line;
6229 *(p++) = '\0';
6230
6231 p2 = strrchr(p, ':');
6232 if (!p2)
6233 goto out;
6234 *p2 = '\0';
6235
6236 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6237 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6238 * because it parses out the empty string "" and later on passes
6239 * it to mount(). Let's skip such entries.
6240 */
6241 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6242 found_unified = true;
6243 p = "unified";
6244 }
6245
6246 if (!store_hierarchy(line, p))
6247 goto out;
6248 }
6249
6250 /* Preserve initial namespace. */
6251 init_ns = preserve_mnt_ns(getpid());
6252 if (init_ns < 0) {
6253 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6254 goto out;
6255 }
6256
6257 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6258 if (!fd_hierarchies) {
6259 lxcfs_error("%s\n", strerror(errno));
6260 goto out;
6261 }
6262
6263 for (i = 0; i < num_hierarchies; i++)
6264 fd_hierarchies[i] = -1;
6265
6266 cret = getcwd(cwd, MAXPATHLEN);
6267 if (!cret)
6268 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6269
6270 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6271 * to privately mount lxcfs cgroups. */
6272 if (!cgfs_setup_controllers()) {
6273 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6274 goto out;
6275 }
6276
6277 if (setns(init_ns, 0) < 0) {
6278 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6279 goto out;
6280 }
6281
6282 if (!cret || chdir(cwd) < 0)
6283 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6284
6285 if (!init_cpuview()) {
6286 lxcfs_error("%s\n", "failed to init CPU view");
6287 goto out;
6288 }
6289
6290 print_subsystems();
6291
6292 out:
6293 free(line);
6294 fclose(f);
6295 if (init_ns >= 0)
6296 close(init_ns);
6297 }
6298
6299 static void __attribute__((destructor)) free_subsystems(void)
6300 {
6301 int i;
6302
6303 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6304
6305 for (i = 0; i < num_hierarchies; i++) {
6306 if (hierarchies[i])
6307 free(hierarchies[i]);
6308 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6309 close(fd_hierarchies[i]);
6310 }
6311 free(hierarchies);
6312 free(fd_hierarchies);
6313 free_cpuview();
6314
6315 if (cgroup_mount_ns_fd >= 0)
6316 close(cgroup_mount_ns_fd);
6317 }