]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #266 from tomponline/master
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
/* The largest 64-bit unsigned integer, 2^64 - 1 = 18446744073709551615, has 20 decimal digits; 21 leaves room for one more byte (e.g. a terminating NUL). */
43 #define LXCFS_NUMSTRLEN64 21
44
45 /* Define pivot_root() if missing from the C library */
46 #ifndef HAVE_PIVOT_ROOT
/*
 * Fallback pivot_root() wrapper: issues the raw syscall when the headers
 * provide __NR_pivot_root, otherwise fails with errno set to ENOSYS.
 */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
56 #else
57 extern int pivot_root(const char * new_root, const char * put_old);
58 #endif
59
/*
 * File type tags.
 * NOTE(review): presumably stored in file_info.type to tell cgroup
 * directories/files apart from the emulated /proc files - confirm against the
 * users of these constants, which are not visible here.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/*
 * Per-file bookkeeping record.
 * NOTE(review): the code that fills and consumes this struct is not visible
 * in this part of the file; the field notes are limited to what the existing
 * comments state.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* NOTE(review): presumably one of the LXC_TYPE_* values - confirm */
	char *buf; // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
82
/*
 * User, system and idle counters plus an online flag.
 * NOTE(review): presumably one instance per CPU (struct cg_proc_stat keeps
 * arrays of these next to a cpu_count); units are not established here.
 */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online;
};
89
/* Parameters of the load-average hash table. */
#define LOAD_SIZE 100 /*the size of hash_table */
#define FLUSH_TIME 5 /*the flush rate */
#define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point helpers used to calculate the load average. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
/* Integer part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
/* Fractional part of a fixed-point value, scaled to two decimal digits. */
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* NOTE(review): presumably a stop request flag for the loadavg worker - confirm at its users. */
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * Hash a NUL-terminated string with the ELFHash algorithm.
 * The result is masked to 31 bits, so it is never negative.
 */
static int calc_hash(const char *name)
{
	unsigned int acc = 0;
	const char *p;

	for (p = name; *p != '\0'; p++) {
		unsigned int top;

		acc = (acc << 4) + *p;
		/* Fold the top nibble back into the low bits, then clear it. */
		top = acc & 0xf0000000;
		if (top)
			acc ^= top >> 24;
		acc &= ~top;
	}

	return acc & 0x7fffffff;
}
122
/* One entry of the load-average hash table; entries are chained per bucket. */
struct load_node {
	char *cg; /* string compared by locate_node(); freed together with the node */
	unsigned long avenrun[3]; /* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in the same bucket, or NULL */
	struct load_node **pre; /* address of the pointer that points at this node */
};
133
/* Head of one hash bucket: the chain of nodes plus the locks guarding it. */
struct load_head {
	/*
	 * Serializes inserting a load_node with refreshing load_nodes. For the
	 * first load_node of each hash bucket, insert and refresh are mutually
	 * exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * Serializes reading loadavg with deleting a load_node. Within a hash
	 * bucket, read and delete are mutually exclusive, while several readers
	 * may run in parallel. This lock covers the whole list.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Serializes reading loadavg with inserting a load_node. For the first
	 * load_node of each hash bucket, read and insert are mutually exclusive,
	 * while several readers may run in parallel.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of the bucket, or NULL */
};
155
156 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
157 /*
158 * init_load initialize the hash table.
159 * Return 0 on success, return -1 on failure.
160 */
161 static int init_load(void)
162 {
163 int i;
164 int ret;
165
166 for (i = 0; i < LOAD_SIZE; i++) {
167 load_hash[i].next = NULL;
168 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
169 if (ret != 0) {
170 lxcfs_error("%s\n", "Failed to initialize lock");
171 goto out3;
172 }
173 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
174 if (ret != 0) {
175 lxcfs_error("%s\n", "Failed to initialize rdlock");
176 goto out2;
177 }
178 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
179 if (ret != 0) {
180 lxcfs_error("%s\n", "Failed to initialize rilock");
181 goto out1;
182 }
183 }
184 return 0;
185 out1:
186 pthread_rwlock_destroy(&load_hash[i].rdlock);
187 out2:
188 pthread_mutex_destroy(&load_hash[i].lock);
189 out3:
190 while (i > 0) {
191 i--;
192 pthread_mutex_destroy(&load_hash[i].lock);
193 pthread_rwlock_destroy(&load_hash[i].rdlock);
194 pthread_rwlock_destroy(&load_hash[i].rilock);
195 }
196 return -1;
197 }
198
199 static void insert_node(struct load_node **n, int locate)
200 {
201 struct load_node *f;
202
203 pthread_mutex_lock(&load_hash[locate].lock);
204 pthread_rwlock_wrlock(&load_hash[locate].rilock);
205 f = load_hash[locate].next;
206 load_hash[locate].next = *n;
207
208 (*n)->pre = &(load_hash[locate].next);
209 if (f)
210 f->pre = &((*n)->next);
211 (*n)->next = f;
212 pthread_mutex_unlock(&load_hash[locate].lock);
213 pthread_rwlock_unlock(&load_hash[locate].rilock);
214 }
215 /*
216 * locate_node() finds special node. Not return NULL means success.
217 * It should be noted that rdlock isn't unlocked at the end of code
218 * because this function is used to read special node. Delete is not
219 * allowed before read has ended.
220 * unlock rdlock only in proc_loadavg_read().
221 */
222 static struct load_node *locate_node(char *cg, int locate)
223 {
224 struct load_node *f = NULL;
225 int i = 0;
226
227 pthread_rwlock_rdlock(&load_hash[locate].rilock);
228 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
229 if (load_hash[locate].next == NULL) {
230 pthread_rwlock_unlock(&load_hash[locate].rilock);
231 return f;
232 }
233 f = load_hash[locate].next;
234 pthread_rwlock_unlock(&load_hash[locate].rilock);
235 while (f && ((i = strcmp(f->cg, cg)) != 0))
236 f = f->next;
237 return f;
238 }
239 /* Delete the load_node n and return the next node of it. */
240 static struct load_node *del_node(struct load_node *n, int locate)
241 {
242 struct load_node *g;
243
244 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
245 if (n->next == NULL) {
246 *(n->pre) = NULL;
247 } else {
248 *(n->pre) = n->next;
249 n->next->pre = n->pre;
250 }
251 g = n->next;
252 free(n->cg);
253 free(n);
254 pthread_rwlock_unlock(&load_hash[locate].rdlock);
255 return g;
256 }
257
258 static void load_free(void)
259 {
260 int i;
261 struct load_node *f, *p;
262
263 for (i = 0; i < LOAD_SIZE; i++) {
264 pthread_mutex_lock(&load_hash[i].lock);
265 pthread_rwlock_wrlock(&load_hash[i].rilock);
266 pthread_rwlock_wrlock(&load_hash[i].rdlock);
267 if (load_hash[i].next == NULL) {
268 pthread_mutex_unlock(&load_hash[i].lock);
269 pthread_mutex_destroy(&load_hash[i].lock);
270 pthread_rwlock_unlock(&load_hash[i].rilock);
271 pthread_rwlock_destroy(&load_hash[i].rilock);
272 pthread_rwlock_unlock(&load_hash[i].rdlock);
273 pthread_rwlock_destroy(&load_hash[i].rdlock);
274 continue;
275 }
276 for (f = load_hash[i].next; f; ) {
277 free(f->cg);
278 p = f->next;
279 free(f);
280 f = p;
281 }
282 pthread_mutex_unlock(&load_hash[i].lock);
283 pthread_mutex_destroy(&load_hash[i].lock);
284 pthread_rwlock_unlock(&load_hash[i].rilock);
285 pthread_rwlock_destroy(&load_hash[i].rilock);
286 pthread_rwlock_unlock(&load_hash[i].rdlock);
287 pthread_rwlock_destroy(&load_hash[i].rdlock);
288 }
289 }
290
/*
 * Data for CPU view.
 * One node of a singly linked list hanging off a cg_proc_stat_head.
 */
struct cg_proc_stat {
	char *cg; /* heap string, released by free_proc_stat_node() */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* NOTE(review): presumably the length of usage/view - confirm */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};
300
/* Head of one CPU-view hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next; /* first node of the list, or NULL */
	time_t lastcheck; /* set to the current time by cpuview_init_head() */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

/* Number of buckets in proc_stat_history. */
#define CPUVIEW_HASH_SIZE 100
/* Bucket heads; allocated by init_cpuview() and released by free_cpuview(). */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
313
314 static bool cpuview_init_head(struct cg_proc_stat_head **head)
315 {
316 *head = malloc(sizeof(struct cg_proc_stat_head));
317 if (!(*head)) {
318 lxcfs_error("%s\n", strerror(errno));
319 return false;
320 }
321
322 (*head)->lastcheck = time(NULL);
323 (*head)->next = NULL;
324
325 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
326 lxcfs_error("%s\n", "Failed to initialize list lock");
327 free(*head);
328 return false;
329 }
330
331 return true;
332 }
333
334 static bool init_cpuview()
335 {
336 int i;
337
338 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
339 proc_stat_history[i] = NULL;
340
341 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
342 if (!cpuview_init_head(&proc_stat_history[i]))
343 goto err;
344 }
345
346 return true;
347
348 err:
349 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
350 if (proc_stat_history[i]) {
351 free(proc_stat_history[i]);
352 proc_stat_history[i] = NULL;
353 }
354 }
355
356 return false;
357 }
358
359 static void free_proc_stat_node(struct cg_proc_stat *node)
360 {
361 pthread_mutex_destroy(&node->lock);
362 free(node->cg);
363 free(node->usage);
364 free(node->view);
365 free(node);
366 }
367
368 static void cpuview_free_head(struct cg_proc_stat_head *head)
369 {
370 struct cg_proc_stat *node, *tmp;
371
372 if (head->next) {
373 node = head->next;
374
375 for (;;) {
376 tmp = node;
377 node = node->next;
378 free_proc_stat_node(tmp);
379
380 if (!node)
381 break;
382 }
383 }
384
385 pthread_rwlock_destroy(&head->lock);
386 free(head);
387 }
388
389 static void free_cpuview()
390 {
391 int i;
392
393 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
394 if (proc_stat_history[i])
395 cpuview_free_head(proc_stat_history[i]);
396 }
397 }
398
399 /* Reserve buffer size to account for file size changes. */
400 #define BUF_RESERVE_SIZE 512
401
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // time of the last successful validation; drives pruning
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Map an inode number to a bucket index of pidns_hash_table. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Hash table of cached init pids, guarded by pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l. A failure to take the lock is treated as fatal: the error is
 * reported and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int rc = pthread_mutex_lock(l);

	if (rc == 0)
		return;

	lxcfs_error("returned:%d %s\n", rc, strerror(rc));
	exit(1);
}
439
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* Starts out as -1; printed by print_subsystems() as the "mount namespace". */
static int cgroup_mount_ns_fd = -1;
460
/*
 * Unlock @l. A failure to release the lock is treated as fatal: the error is
 * reported and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int rc = pthread_mutex_unlock(l);

	if (rc == 0)
		return;

	lxcfs_error("returned:%d %s\n", rc, strerror(rc));
	exit(1);
}
470
/* Take the mutex protecting the pidns init store; exits the process on failure (see lock_mutex()). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
475
/* Release the mutex protecting the pidns init store; exits the process on failure (see unlock_mutex()). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
480
481 /* Must be called under store_lock */
482 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
483 {
484 struct stat initsb;
485 char fnam[100];
486
487 snprintf(fnam, 100, "/proc/%d", e->initpid);
488 if (stat(fnam, &initsb) < 0)
489 return false;
490
491 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
492 initsb.st_ctime, e->initpid);
493
494 if (e->ctime != initsb.st_ctime)
495 return false;
496 return true;
497 }
498
499 /* Must be called under store_lock */
500 static void remove_initpid(struct pidns_init_store *e)
501 {
502 struct pidns_init_store *tmp;
503 int h;
504
505 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
506
507 h = HASH(e->ino);
508 if (pidns_hash_table[h] == e) {
509 pidns_hash_table[h] = e->next;
510 free(e);
511 return;
512 }
513
514 tmp = pidns_hash_table[h];
515 while (tmp) {
516 if (tmp->next == e) {
517 tmp->next = e->next;
518 free(e);
519 return;
520 }
521 tmp = tmp->next;
522 }
523 }
524
525 #define PURGE_SECS 5
526 /* Must be called under store_lock */
527 static void prune_initpid_store(void)
528 {
529 static long int last_prune = 0;
530 struct pidns_init_store *e, *prev, *delme;
531 long int now, threshold;
532 int i;
533
534 if (!last_prune) {
535 last_prune = time(NULL);
536 return;
537 }
538 now = time(NULL);
539 if (now < last_prune + PURGE_SECS)
540 return;
541
542 lxcfs_debug("%s\n", "Pruning.");
543
544 last_prune = now;
545 threshold = now - 2 * PURGE_SECS;
546
547 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
548 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
549 if (e->lastcheck < threshold) {
550
551 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
552
553 delme = e;
554 if (prev)
555 prev->next = e->next;
556 else
557 pidns_hash_table[i] = e->next;
558 e = e->next;
559 free(delme);
560 } else {
561 prev = e;
562 e = e->next;
563 }
564 }
565 }
566 }
567
568 /* Must be called under store_lock */
569 static void save_initpid(struct stat *sb, pid_t pid)
570 {
571 struct pidns_init_store *e;
572 char fpath[100];
573 struct stat procsb;
574 int h;
575
576 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
577
578 snprintf(fpath, 100, "/proc/%d", pid);
579 if (stat(fpath, &procsb) < 0)
580 return;
581 do {
582 e = malloc(sizeof(*e));
583 } while (!e);
584 e->ino = sb->st_ino;
585 e->initpid = pid;
586 e->ctime = procsb.st_ctime;
587 h = HASH(e->ino);
588 e->next = pidns_hash_table[h];
589 e->lastcheck = time(NULL);
590 pidns_hash_table[h] = e;
591 }
592
593 /*
594 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
595 * entry for the inode number and creation time. Verify that the init pid
596 * is still valid. If not, remove it. Return the entry if valid, NULL
597 * otherwise.
598 * Must be called under store_lock
599 */
600 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
601 {
602 int h = HASH(sb->st_ino);
603 struct pidns_init_store *e = pidns_hash_table[h];
604
605 while (e) {
606 if (e->ino == sb->st_ino) {
607 if (initpid_still_valid(e, sb)) {
608 e->lastcheck = time(NULL);
609 return e;
610 }
611 remove_initpid(e);
612 return NULL;
613 }
614 e = e->next;
615 }
616
617 return NULL;
618 }
619
/*
 * Return 1 if @path, resolved relative to the directory file descriptor @fd,
 * names a directory (symbolic links are followed), 0 otherwise, including
 * when the path cannot be stat()ed.
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/*
	 * The last argument of fstatat() is a flags bit mask, not a file
	 * descriptor. Passing fd there made the call fail with EINVAL or
	 * apply flags that were never intended.
	 */
	int ret = fstatat(fd, path, &statbuf, 0);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
628
/*
 * Duplicate @str on the heap. A NULL input yields NULL. Otherwise strdup()
 * is retried until it succeeds, so the result is never NULL. The caller owns
 * the returned string.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	for (;;) {
		copy = strdup(str);
		if (copy)
			return copy;
	}
}
640
/* Strip every '\n' at the end of @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n')
		s[--end] = '\0';
}
648
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes. Memory is managed in batches of
 * BATCH_SIZE bytes: the buffer is (re)allocated only when it does not exist
 * yet or when @newlen needs more batches than @oldlen did. The allocation is
 * retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int batches_needed = (newlen / BATCH_SIZE) + 1;
	int batches_held = (oldlen / BATCH_SIZE) + 1;
	char *grown = NULL;

	if (*mem && batches_needed <= batches_held)
		return;

	while (!grown)
		grown = realloc(*mem, batches_needed * BATCH_SIZE);
	*mem = grown;
}
/*
 * Append @line (@linelen bytes plus its terminating NUL) to the growing
 * buffer *contents, whose current length is *len, and update *len.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t used = *len;
	size_t total = used + linelen;

	dorealloc(contents, used, total + 1);
	/* Copy linelen + 1 bytes so the buffer stays NUL-terminated. */
	memcpy(*contents + used, line, linelen+1);
	*len = total;
}
670
/*
 * Read everything available from @fd into one heap string and strip its
 * trailing newlines.
 *
 * @from is not used by this function.
 * @fd is consumed on every path: closed through fclose() after reading, or
 * closed directly when fdopen() fails (fdopen() leaves the descriptor open
 * on failure, so it used to leak).
 * NOTE(review): this assumes callers never close @fd themselves after a NULL
 * return - true for cgfs_get_value() in this file; confirm for any others.
 *
 * Returns the contents (the caller frees them), or NULL when the stream
 * could not be opened or nothing was read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
692
/*
 * Write @string to the already opened descriptor @fd.
 *
 * @fnam is only used in error messages.
 * @fd is consumed on every path: closed through fclose(), or closed directly
 * when fdopen() fails (fdopen() leaves the descriptor open on failure, so it
 * used to leak).
 * NOTE(review): this assumes callers never close @fd themselves - true for
 * cgfs_set_value() in this file; confirm for any others.
 *
 * Returns true when the whole string was written and the stream closed
 * without error.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes, so a failed write may only surface here. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
718
/*
 * Name and ownership/mode information of one cgroup file or directory, as
 * filled in by cgfs_get_key() from fstatat(); released with free_key().
 */
struct cgfs_files {
	char *name;	/* heap-allocated, freed by free_key() */
	uint32_t uid, gid;	/* st_uid / st_gid of the entry */
	uint32_t mode;	/* st_mode of the entry */
};
724
725 #define ALLOC_NUM 20
726 static bool store_hierarchy(char *stridx, char *h)
727 {
728 if (num_hierarchies % ALLOC_NUM == 0) {
729 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
730 n *= ALLOC_NUM;
731 char **tmp = realloc(hierarchies, n * sizeof(char *));
732 if (!tmp) {
733 lxcfs_error("%s\n", strerror(errno));
734 exit(1);
735 }
736 hierarchies = tmp;
737 }
738
739 hierarchies[num_hierarchies++] = must_copy_string(h);
740 return true;
741 }
742
743 static void print_subsystems(void)
744 {
745 int i;
746
747 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
748 fprintf(stderr, "hierarchies:\n");
749 for (i = 0; i < num_hierarchies; i++) {
750 if (hierarchies[i])
751 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
752 fd_hierarchies[i], hierarchies[i]);
753 }
754 }
755
/*
 * Return true if @needle equals one whole element of the comma-separated
 * list @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const size_t needle_len = strlen(needle);
	const char *token = haystack;
	const char *comma;

	/* Check every element that is followed by a comma. */
	for (; *token != '\0'; token = comma + 1) {
		comma = strchr(token, ',');
		if (comma == NULL)
			break;

		if ((size_t)(comma - token) == needle_len &&
		    strncmp(needle, token, needle_len) == 0)
			return true;
	}

	/* What is left is the last element (possibly the empty string). */
	return strcmp(needle, token) == 0;
}
774
775 /* do we need to do any massaging here? I'm not sure... */
776 /* Return the mounted controller and store the corresponding open file descriptor
777 * referring to the controller mountpoint in the private lxcfs namespace in
778 * @cfd.
779 */
780 static char *find_mounted_controller(const char *controller, int *cfd)
781 {
782 int i;
783
784 for (i = 0; i < num_hierarchies; i++) {
785 if (!hierarchies[i])
786 continue;
787 if (strcmp(hierarchies[i], controller) == 0) {
788 *cfd = fd_hierarchies[i];
789 return hierarchies[i];
790 }
791 if (in_comma_list(controller, hierarchies[i])) {
792 *cfd = fd_hierarchies[i];
793 return hierarchies[i];
794 }
795 }
796
797 return NULL;
798 }
799
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false when the controller is not mounted, the path cannot be built
 * or opened for writing, or the write itself fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int cfd, fd, written;
	size_t pathsz;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathsz = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(pathsz);
	written = snprintf(relpath, pathsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= pathsz)
		return false;

	fd = openat(cfd, relpath, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(relpath, value, fd);
}
826
// Chown all the files in the cgroup directory.  We do this when we create
// a cgroup on behalf of a user.
//
// @dirname is resolved relative to the directory file descriptor @fd.
// Failures are reported and skipped entry by entry; nothing is returned to
// the caller.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not take ownership of fd1; do not leak it. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	/* closedir() also closes fd1. */
	closedir(d);
}
866
/*
 * Create cgroup @cg under @controller with mode 0755. Unless both @uid and
 * @gid are 0, the new directory and the files inside it are chowned to
 * @uid:@gid.
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathsz;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	pathsz = strlen(cg) + 2;
	relpath = alloca(pathsz);
	snprintf(relpath, pathsz, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, relpath, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, relpath, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(relpath, uid, gid, cfd);
	}

	return 0;
}
897
/*
 * Remove directory @dirname and, first, every directory below it.
 *
 * @dirname  path of the directory, relative to @cfd
 * @fd       open descriptor of @dirname itself; it is only dup()ed here and
 *           stays open for the caller to close
 * @cfd      directory descriptor that all paths are resolved against
 *
 * Each subdirectory is opened on its own before recursing, so the recursive
 * call walks that subdirectory rather than re-reading this one through a
 * descriptor that shares our read offset.
 *
 * Returns true only if the directory stream closed cleanly and @dirname
 * itself was removed. Failures on individual children are only logged.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = true;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, subfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		subfd = openat(cfd, pathname, O_RDONLY | O_DIRECTORY);
		if (subfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!recursive_rmdir(pathname, subfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(subfd);
	}

	/*
	 * closedir() also closes dupfd, so dupfd must not be closed again
	 * afterwards: its number may already belong to another thread's file.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
956
/*
 * Remove cgroup @cg of @controller together with the directories below it.
 * Returns false when the controller is not mounted, the cgroup directory
 * cannot be opened, or recursive_rmdir() reports a failure.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd_cg;
	size_t pathsz;
	char *relpath;
	bool removed;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	pathsz = strlen(cg) + 2;
	relpath = alloca(pathsz);
	snprintf(relpath, pathsz, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd_cg = openat(cfd, relpath, O_DIRECTORY);
	if (dirfd_cg < 0)
		return false;

	removed = recursive_rmdir(relpath, dirfd_cg, cfd);
	close(dirfd_cg);
	return removed;
}
983
/*
 * Change the mode of @file (a path inside the @controller hierarchy) to
 * @mode. Returns false when the controller is not mounted or fchmodat()
 * fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t pathsz;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	pathsz = strlen(file) + 2;
	relpath = alloca(pathsz);
	snprintf(relpath, pathsz, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
1004
/*
 * Chown "tasks" and then "cgroup.procs" inside @dirname (relative to @fd) to
 * @uid:@gid. Stops at the first failure. Returns 0 on success or -errno of
 * the failing fchownat() call.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two file names. */
	size_t bufsz = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(bufsz);

	snprintf(path, bufsz, "%s/tasks", dirname);
	if (fchownat(fd, path, uid, gid, 0) == 0) {
		snprintf(path, bufsz, "%s/cgroup.procs", dirname);
		if (fchownat(fd, path, uid, gid, 0) == 0)
			return 0;
	}

	return -errno;
}
1020
/*
 * Chown @file (a path inside the @controller hierarchy) to @uid:@gid. When
 * the path is a directory, its tasks and cgroup.procs files are chowned as
 * well.
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from a failing fchownat() call.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathsz;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	pathsz = strlen(file) + 2;
	relpath = alloca(pathsz);
	snprintf(relpath, pathsz, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, relpath, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(relpath, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(relpath, uid, gid, cfd);
}
1046
/*
 * Open the cgroup.procs file of @cgroup under @controller for writing.
 * Returns a stream the caller must fclose(), or NULL when the controller is
 * not mounted or the file cannot be opened. No descriptor is left open on
 * failure.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	/* fdopen() leaves fd open when it fails; close it so it does not leak. */
	if (!f)
		close(fd);

	return f;
}
1070
1071 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1072 void ***list, size_t typesize,
1073 void* (*iterator)(const char*, const char*, const char*))
1074 {
1075 int cfd, fd, ret;
1076 size_t len;
1077 char *cg, *tmpc;
1078 char pathname[MAXPATHLEN];
1079 size_t sz = 0, asz = 0;
1080 struct dirent *dirent;
1081 DIR *dir;
1082
1083 tmpc = find_mounted_controller(controller, &cfd);
1084 *list = NULL;
1085 if (!tmpc)
1086 return false;
1087
1088 /* Make sure we pass a relative path to *at() family of functions. */
1089 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1090 cg = alloca(len);
1091 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1092 if (ret < 0 || (size_t)ret >= len) {
1093 lxcfs_error("Pathname too long under %s\n", cgroup);
1094 return false;
1095 }
1096
1097 fd = openat(cfd, cg, O_DIRECTORY);
1098 if (fd < 0)
1099 return false;
1100
1101 dir = fdopendir(fd);
1102 if (!dir)
1103 return false;
1104
1105 while ((dirent = readdir(dir))) {
1106 struct stat mystat;
1107
1108 if (!strcmp(dirent->d_name, ".") ||
1109 !strcmp(dirent->d_name, ".."))
1110 continue;
1111
1112 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1113 if (ret < 0 || ret >= MAXPATHLEN) {
1114 lxcfs_error("Pathname too long under %s\n", cg);
1115 continue;
1116 }
1117
1118 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1119 if (ret) {
1120 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1121 continue;
1122 }
1123 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1124 (directories && !S_ISDIR(mystat.st_mode)))
1125 continue;
1126
1127 if (sz+2 >= asz) {
1128 void **tmp;
1129 asz += BATCH_SIZE;
1130 do {
1131 tmp = realloc(*list, asz * typesize);
1132 } while (!tmp);
1133 *list = tmp;
1134 }
1135 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1136 (*list)[sz+1] = NULL;
1137 sz++;
1138 }
1139 if (closedir(dir) < 0) {
1140 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1141 return false;
1142 }
1143 return true;
1144 }
1145
/*
 * Iterator callback for cgfs_iterate_cgroup(): returns a heap copy of
 * @dir_entry. @controller and @cgroup are not used. strdup() is retried
 * until it succeeds, so the result is never NULL.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy = NULL;

	while (!copy)
		copy = strdup(dir_entry);

	return copy;
}
1154
/*
 * List the subdirectories of @cgroup under @controller.
 * Delegates to cgfs_iterate_cgroup() with directories=true and
 * make_children_list_entry() as the per-entry constructor, so *list receives
 * a NULL-terminated array of heap-allocated directory names (or NULL when
 * there are none). Returns whatever cgfs_iterate_cgroup() returns.
 */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1159
1160 void free_key(struct cgfs_files *k)
1161 {
1162 if (!k)
1163 return;
1164 free(k->name);
1165 free(k);
1166 }
1167
/*
 * Free a NULL-terminated array of cgfs_files records, including the array
 * itself; NULL is accepted.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (keys == NULL)
		return;

	for (cursor = keys; *cursor != NULL; cursor++)
		free_key(*cursor);

	free(keys);
}
1179
/*
 * Read @file inside @cgroup of @controller into a newly allocated string
 * stored in *value (trailing newlines removed by slurp_file()).
 * Returns false when the controller is not mounted, the path cannot be built
 * or opened, or slurp_file() returns NULL.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, written;
	size_t pathsz;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathsz = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(pathsz);
	written = snprintf(relpath, pathsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= pathsz)
		return false;

	fd = openat(cfd, relpath, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(relpath, fd);
	return *value != NULL;
}
1206
/*
 * Return true if @file exists inside @cgroup of @controller (existence check
 * only, via faccessat() with F_OK). Returns false when the controller is not
 * mounted or the path cannot be built.
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int cfd, written;
	size_t pathsz;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathsz = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(pathsz);
	written = snprintf(relpath, pathsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= pathsz)
		return false;

	if (faccessat(cfd, relpath, F_OK, 0) != 0)
		return false;
	return true;
}
1228
1229 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1230 {
1231 int ret, cfd;
1232 size_t len;
1233 char *fnam, *tmpc;
1234 struct stat sb;
1235 struct cgfs_files *newkey;
1236
1237 tmpc = find_mounted_controller(controller, &cfd);
1238 if (!tmpc)
1239 return false;
1240
1241 if (file && *file == '/')
1242 file++;
1243
1244 if (file && strchr(file, '/'))
1245 return NULL;
1246
1247 /* Make sure we pass a relative path to *at() family of functions.
1248 * . + /cgroup + / + file + \0
1249 */
1250 len = strlen(cgroup) + 3;
1251 if (file)
1252 len += strlen(file) + 1;
1253 fnam = alloca(len);
1254 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1255 file ? "/" : "", file ? file : "");
1256
1257 ret = fstatat(cfd, fnam, &sb, 0);
1258 if (ret < 0)
1259 return NULL;
1260
1261 do {
1262 newkey = malloc(sizeof(struct cgfs_files));
1263 } while (!newkey);
1264 if (file)
1265 newkey->name = must_copy_string(file);
1266 else if (strrchr(cgroup, '/'))
1267 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1268 else
1269 newkey->name = must_copy_string(cgroup);
1270 newkey->uid = sb.st_uid;
1271 newkey->gid = sb.st_gid;
1272 newkey->mode = sb.st_mode;
1273
1274 return newkey;
1275 }
1276
/*
 * Per-entry builder used by cgfs_list_keys(): looks up the key for
 * @dir_entry via cgfs_get_key(). A failed lookup is logged and NULL is
 * returned to the caller.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		    cgroup);
	return key;
}
1286
1287 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1288 {
1289 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1290 }
1291
/*
 * Report whether @f is a directory inside @cgroup of @controller.
 *
 * The entry is stat'ed relative to the controller's directory fd; an
 * absolute @cgroup gets a "." prepended so the path stays relative.
 * Returns false if the controller is not mounted, the path could not be
 * built, the stat fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, written;
	size_t pathlen;
	char *relpath;
	struct stat sb;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* "." + cgroup + "/" + f + '\0' */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(pathlen);
	written = snprintf(relpath, pathlen, "%s%s/%s",
			   *cgroup == '/' ? "." : "", cgroup, f);
	if (written < 0 || (size_t)written >= pathlen)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode) ? true : false;
}
1319
/* Result codes returned by send_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);

/*
 * Enter @target's pid namespace and clone a task which sends credentials
 * claiming pid 1 over @sock (see send_creds_clone_wrapper()), so the
 * receiver can read the task's reaper's pid in its own namespace.
 *
 * Meant to run in a forked helper: every path taken by the process that
 * calls clone() ends in _exit(). The exit status is 0 when the cloned
 * child exited successfully and 1 on any failure.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* The namespace fd is no longer needed once setns() has succeeded. */
	close(fd);

	/* The top of the stack area is passed, as the original code does. */
	pid = clone(send_creds_clone_wrapper, (char *)stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* wait_for_pid() returns 0 on success and -1 on failure. */
		if (wait_for_pid(pid) != 0)
			_exit(1);
		_exit(0);
	}
}
1369
1370 static int send_creds_clone_wrapper(void *arg) {
1371 struct ucred cred;
1372 char v;
1373 int sock = *(int *)arg;
1374
1375 /* we are the child */
1376 cred.uid = 0;
1377 cred.gid = 0;
1378 cred.pid = 1;
1379 v = '1';
1380 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1381 return 1;
1382 return 0;
1383 }
1384
/*
 * Determine the pid, as seen from this process, reported for the init task
 * of @task's pid namespace.
 *
 * A helper is forked which runs write_task_init_pid_exit() on one end of a
 * socketpair; the credentials received on the other end carry the pid.
 * Both socket ends are closed and the helper is reaped before returning.
 *
 * Returns the received pid, or -1 if the socketpair, the fork or the
 * receive failed.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t helper;
	pid_t initpid = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		/* child: keeps sock[0] only and never returns */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);

	return initpid;
}
1418
1419 static pid_t lookup_initpid_in_store(pid_t qpid)
1420 {
1421 pid_t answer = 0;
1422 struct stat sb;
1423 struct pidns_init_store *e;
1424 char fnam[100];
1425
1426 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1427 store_lock();
1428 if (stat(fnam, &sb) < 0)
1429 goto out;
1430 e = lookup_verify_initpid(&sb);
1431 if (e) {
1432 answer = e->initpid;
1433 goto out;
1434 }
1435 answer = get_init_pid_for_task(qpid);
1436 if (answer > 0)
1437 save_initpid(&sb, answer);
1438
1439 out:
1440 /* we prune at end in case we are returning
1441 * the value we were about to return */
1442 prune_initpid_store();
1443 store_unlock();
1444 return answer;
1445 }
1446
/*
 * Reap child @pid, retrying waitpid() when it is interrupted by a signal.
 *
 * Returns 0 if the child exited normally with status 0. Returns -1 if @pid
 * is not positive, waitpid() fails for a reason other than EINTR, or the
 * child did not exit cleanly.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1467
1468
1469 /*
1470 * append pid to *src.
1471 * src: a pointer to a char* in which ot append the pid.
1472 * sz: the number of characters printed so far, minus trailing \0.
1473 * asz: the allocated size so far
1474 * pid: the pid to append
1475 */
1476 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1477 {
1478 char tmp[30];
1479
1480 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1481
1482 if (!*src || tmplen + *sz + 1 >= *asz) {
1483 char *tmp;
1484 do {
1485 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1486 } while (!tmp);
1487 *src = tmp;
1488 *asz += BUF_RESERVE_SIZE;
1489 }
1490 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1491 *sz += tmplen;
1492 }
1493
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 *
 * The file is rewound and read line by line; lines that do not parse as
 * three unsigned numbers are skipped. Each parsed line is read as
 * "<ns base> <host base> <count>".
 *
 * Returns the mapped id, or -1 (as unsigned int) if no range contains
 * @in_id or a range wraps around.
 */
unsigned int convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base;	/* base id for a range in the idfile's namespace */
	unsigned int host_base;	/* base id for a range in the caller's namespace */
	unsigned int count;	/* number of ids in this range */
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &count) != 3)
			continue;

		if (host_base + count < host_base || ns_base + count < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    ns_base, host_base, count, line);
			return -1;
		}

		/*
		 * host_base <= in_id < host_base + count and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		if (in_id >= host_base && in_id < host_base + count)
			return ns_base + (in_id - host_base);
	}

	/* no answer found */
	return -1;
}
1537
/*
 * For is_privileged_over(): whether the calling uid is required to be root
 * in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Buffer size used for /proc/<pid>/... path names. */
#define PROCLEN 100

/*
 * Decide whether @uid, acting from process @pid, is privileged over files
 * owned by @victim.
 *
 * - Either id being -1 is rejected.
 * - With @req_ns_root false, an identical uid is sufficient (i.e. uid 1000
 *   has write access to files owned by uid 1000).
 * - Otherwise @uid must map to 0 in /proc/<pid>/uid_map and @victim must
 *   be mapped there at all.
 *   XXX the second check may be unnecessary given that fuse will be
 *   sending requests where the vfs has already converted the ids.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mappath[PROCLEN];
	FILE *idmap;
	bool allowed;
	int len;

	if (victim == (uid_t)-1 || uid == (uid_t)-1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	len = snprintf(mappath, PROCLEN, "/proc/%d/uid_map", pid);
	if (len < 0 || len >= PROCLEN)
		return false;

	idmap = fopen(mappath, "r");
	if (!idmap)
		return false;

	/* the victim lookup only runs when the caller maps to uid 0 */
	allowed = convert_id_to_ns(idmap, uid) == 0 &&
		  convert_id_to_ns(idmap, victim) != (uid_t)-1;

	fclose(idmap);
	return allowed;
}
1593
/*
 * Check whether the "other" permission bits of @fmode grant what the
 * open(2) access mode in @req_mode asks for: read for O_RDONLY, write for
 * O_WRONLY, both for O_RDWR. Callers shift owner or group bits into the
 * "other" position before calling. Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1613
1614
/*
 * Return the first path component of @taskcg that follows the prefix
 * @querycg.
 *
 * For a @querycg of "/" or "./" one leading character of @taskcg is
 * skipped; otherwise strlen(@querycg) + 1 characters are skipped. The
 * remainder is copied and cut at its first '/'.
 * E.g. taskcg "a/b/c/d/e" with querycg "a/b/c" yields "d".
 *
 * @taskcg must be longer than @querycg; otherwise an error is logged and
 * NULL is returned. NULL is also returned when the copy cannot be
 * allocated. The caller frees the result.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *component, *sep;
	size_t skip;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	component = strdup(taskcg + skip);
	if (!component)
		return NULL;

	sep = strchr(component, '/');
	if (sep)
		*sep = '\0';

	return component;
}
1640
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;

	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1647
1648 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1649 {
1650 int cfd;
1651 char fnam[PROCLEN];
1652 FILE *f;
1653 char *answer = NULL;
1654 char *line = NULL;
1655 size_t len = 0;
1656 int ret;
1657 const char *h = find_mounted_controller(contrl, &cfd);
1658 if (!h)
1659 return NULL;
1660
1661 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1662 if (ret < 0 || ret >= PROCLEN)
1663 return NULL;
1664 if (!(f = fopen(fnam, "r")))
1665 return NULL;
1666
1667 while (getline(&line, &len, f) != -1) {
1668 char *c1, *c2;
1669 if (!line[0])
1670 continue;
1671 c1 = strchr(line, ':');
1672 if (!c1)
1673 goto out;
1674 c1++;
1675 c2 = strchr(c1, ':');
1676 if (!c2)
1677 goto out;
1678 *c2 = '\0';
1679 if (strcmp(c1, h) != 0)
1680 continue;
1681 c2++;
1682 stripnewline(c2);
1683 do {
1684 answer = strdup(c2);
1685 } while (!answer);
1686 break;
1687 }
1688
1689 out:
1690 fclose(f);
1691 free(line);
1692 return answer;
1693 }
1694
1695 /*
1696 * check whether a fuse context may access a cgroup dir or file
1697 *
1698 * If file is not null, it is a cgroup file to check under cg.
1699 * If file is null, then we are checking perms on cg itself.
1700 *
1701 * For files we can check the mode of the list_keys result.
1702 * For cgroups, we must make assumptions based on the files under the
1703 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1704 * yet.
1705 */
1706 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1707 {
1708 struct cgfs_files *k = NULL;
1709 bool ret = false;
1710
1711 k = cgfs_get_key(contrl, cg, file);
1712 if (!k)
1713 return false;
1714
1715 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1716 if (perms_include(k->mode >> 6, mode)) {
1717 ret = true;
1718 goto out;
1719 }
1720 }
1721 if (fc->gid == k->gid) {
1722 if (perms_include(k->mode >> 3, mode)) {
1723 ret = true;
1724 goto out;
1725 }
1726 }
1727 ret = perms_include(k->mode, mode);
1728
1729 out:
1730 free_key(k);
1731 return ret;
1732 }
1733
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place.
 * A path that consists of nothing but "/init.scope" becomes "/"; any other
 * path ending in it simply loses the suffix. Paths without the suffix are
 * left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';	/* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1751
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the cgroup of @pid for @contrl (with a trailing
 * "/init.scope" pruned) is @cg itself or an ancestor of @cg. The match is
 * made on whole path components, so a task in /a/b is NOT treated as an
 * ancestor of /a/bc.
 *
 * If the answer is false and @nextcg is not NULL, then *@nextcg is set to
 * the result of get_next_cgroup_dir() (the next cgroup directory under cg,
 * or NULL), which must be freed by the caller. False is also returned,
 * with *@nextcg untouched, when the task's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;
	size_t len;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'. In the latter case the
	 * leading '/' of the task's cgroup is skipped so both sides compare
	 * in the same form.
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;

	/*
	 * The task's cgroup must be a prefix of cg that ends on a component
	 * boundary: either the prefix is empty or ends in '/' (root cgroup),
	 * or cg ends or continues with '/' right after it.
	 */
	len = strlen(linecmp);
	if (strncmp(linecmp, cg, len) == 0 &&
	    (len == 0 || linecmp[len - 1] == '/' ||
	     cg[len] == '\0' || cg[len] == '/')) {
		answer = true;
	} else if (nextcg) {
		*nextcg = get_next_cgroup_dir(linecmp, cg);
	}

	free(c2);
	return answer;
}
1794
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true when @cg is the root ("/" or "./"), when the task is in the
 * root cgroup, or when @cg is the task's cgroup, one of its ancestors, or
 * one of its descendants (compared on whole path components). The task's
 * cgroup has a trailing "/init.scope" pruned and its leading '/' skipped
 * before comparing. Returns false if the task's cgroup cannot be
 * determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *pidcg;
	const char *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	pidcg = get_pid_cgroup(pid, contrl);
	if (!pidcg)
		return false;
	prune_init_slice(pidcg);

	task_cg = pidcg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything. The prefix
		 * tests below rely on a '/' separator, which for the root is
		 * the leading '/' that was skipped above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(pidcg);
	return visible;
}
1845
1846 /*
1847 * given /cgroup/freezer/a/b, return "freezer".
1848 * the returned char* should NOT be freed.
1849 */
1850 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1851 {
1852 const char *p1;
1853 char *contr, *slash;
1854
1855 if (strlen(path) < 9) {
1856 errno = EACCES;
1857 return NULL;
1858 }
1859 if (*(path + 7) != '/') {
1860 errno = EINVAL;
1861 return NULL;
1862 }
1863 p1 = path + 8;
1864 contr = strdupa(p1);
1865 if (!contr) {
1866 errno = ENOMEM;
1867 return NULL;
1868 }
1869 slash = strstr(contr, "/");
1870 if (slash)
1871 *slash = '\0';
1872
1873 int i;
1874 for (i = 0; i < num_hierarchies; i++) {
1875 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1876 return hierarchies[i];
1877 }
1878 errno = ENOENT;
1879 return NULL;
1880 }
1881
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * The result points into @path, just past the first '/' found at or after
 * offset 8, and errno is cleared. Returns NULL with errno EACCES if @path
 * is shorter than 9 characters, or with errno EINVAL if there is no such
 * '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	sep = strchr(path + 8, '/');
	if (sep == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return sep + 1;
}
1902
/*
 * Split the last path element from the path in @cg.
 *
 * *@dir receives a newly allocated copy of @cg cut at its last '/' (the
 * whole of @cg if it has none) and should be freed by the caller.
 * *@last points at the last '/' inside @cg itself, or is NULL if @cg has
 * no '/', and must not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;

	/* allocation is retried until it succeeds */
	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	*strrchr(copy, '/') = '\0';
}
1922
1923 /*
1924 * FUSE ops for /cgroup
1925 */
1926
1927 int cg_getattr(const char *path, struct stat *sb)
1928 {
1929 struct timespec now;
1930 struct fuse_context *fc = fuse_get_context();
1931 char * cgdir = NULL;
1932 char *last = NULL, *path1, *path2;
1933 struct cgfs_files *k = NULL;
1934 const char *cgroup;
1935 const char *controller = NULL;
1936 int ret = -ENOENT;
1937
1938
1939 if (!fc)
1940 return -EIO;
1941
1942 memset(sb, 0, sizeof(struct stat));
1943
1944 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1945 return -EINVAL;
1946
1947 sb->st_uid = sb->st_gid = 0;
1948 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1949 sb->st_size = 0;
1950
1951 if (strcmp(path, "/cgroup") == 0) {
1952 sb->st_mode = S_IFDIR | 00755;
1953 sb->st_nlink = 2;
1954 return 0;
1955 }
1956
1957 controller = pick_controller_from_path(fc, path);
1958 if (!controller)
1959 return -errno;
1960 cgroup = find_cgroup_in_path(path);
1961 if (!cgroup) {
1962 /* this is just /cgroup/controller, return it as a dir */
1963 sb->st_mode = S_IFDIR | 00755;
1964 sb->st_nlink = 2;
1965 return 0;
1966 }
1967
1968 get_cgdir_and_path(cgroup, &cgdir, &last);
1969
1970 if (!last) {
1971 path1 = "/";
1972 path2 = cgdir;
1973 } else {
1974 path1 = cgdir;
1975 path2 = last;
1976 }
1977
1978 pid_t initpid = lookup_initpid_in_store(fc->pid);
1979 if (initpid <= 0)
1980 initpid = fc->pid;
1981 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1982 * Then check that caller's cgroup is under path if last is a child
1983 * cgroup, or cgdir if last is a file */
1984
1985 if (is_child_cgroup(controller, path1, path2)) {
1986 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1987 ret = -ENOENT;
1988 goto out;
1989 }
1990 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1991 /* this is just /cgroup/controller, return it as a dir */
1992 sb->st_mode = S_IFDIR | 00555;
1993 sb->st_nlink = 2;
1994 ret = 0;
1995 goto out;
1996 }
1997 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1998 ret = -EACCES;
1999 goto out;
2000 }
2001
2002 // get uid, gid, from '/tasks' file and make up a mode
2003 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2004 sb->st_mode = S_IFDIR | 00755;
2005 k = cgfs_get_key(controller, cgroup, NULL);
2006 if (!k) {
2007 sb->st_uid = sb->st_gid = 0;
2008 } else {
2009 sb->st_uid = k->uid;
2010 sb->st_gid = k->gid;
2011 }
2012 free_key(k);
2013 sb->st_nlink = 2;
2014 ret = 0;
2015 goto out;
2016 }
2017
2018 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
2019 sb->st_mode = S_IFREG | k->mode;
2020 sb->st_nlink = 1;
2021 sb->st_uid = k->uid;
2022 sb->st_gid = k->gid;
2023 sb->st_size = 0;
2024 free_key(k);
2025 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2026 ret = -ENOENT;
2027 goto out;
2028 }
2029 ret = 0;
2030 }
2031
2032 out:
2033 free(cgdir);
2034 return ret;
2035 }
2036
2037 int cg_opendir(const char *path, struct fuse_file_info *fi)
2038 {
2039 struct fuse_context *fc = fuse_get_context();
2040 const char *cgroup;
2041 struct file_info *dir_info;
2042 char *controller = NULL;
2043
2044 if (!fc)
2045 return -EIO;
2046
2047 if (strcmp(path, "/cgroup") == 0) {
2048 cgroup = NULL;
2049 controller = NULL;
2050 } else {
2051 // return list of keys for the controller, and list of child cgroups
2052 controller = pick_controller_from_path(fc, path);
2053 if (!controller)
2054 return -errno;
2055
2056 cgroup = find_cgroup_in_path(path);
2057 if (!cgroup) {
2058 /* this is just /cgroup/controller, return its contents */
2059 cgroup = "/";
2060 }
2061 }
2062
2063 pid_t initpid = lookup_initpid_in_store(fc->pid);
2064 if (initpid <= 0)
2065 initpid = fc->pid;
2066 if (cgroup) {
2067 if (!caller_may_see_dir(initpid, controller, cgroup))
2068 return -ENOENT;
2069 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2070 return -EACCES;
2071 }
2072
2073 /* we'll free this at cg_releasedir */
2074 dir_info = malloc(sizeof(*dir_info));
2075 if (!dir_info)
2076 return -ENOMEM;
2077 dir_info->controller = must_copy_string(controller);
2078 dir_info->cgroup = must_copy_string(cgroup);
2079 dir_info->type = LXC_TYPE_CGDIR;
2080 dir_info->buf = NULL;
2081 dir_info->file = NULL;
2082 dir_info->buflen = 0;
2083
2084 fi->fh = (unsigned long)dir_info;
2085 return 0;
2086 }
2087
2088 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2089 struct fuse_file_info *fi)
2090 {
2091 struct file_info *d = (struct file_info *)fi->fh;
2092 struct cgfs_files **list = NULL;
2093 int i, ret;
2094 char *nextcg = NULL;
2095 struct fuse_context *fc = fuse_get_context();
2096 char **clist = NULL;
2097
2098 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
2099 return -EIO;
2100
2101 if (d->type != LXC_TYPE_CGDIR) {
2102 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
2103 return -EIO;
2104 }
2105 if (!d->cgroup && !d->controller) {
2106 // ls /var/lib/lxcfs/cgroup - just show list of controllers
2107 int i;
2108
2109 for (i = 0; i < num_hierarchies; i++) {
2110 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
2111 return -EIO;
2112 }
2113 }
2114 return 0;
2115 }
2116
2117 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
2118 // not a valid cgroup
2119 ret = -EINVAL;
2120 goto out;
2121 }
2122
2123 pid_t initpid = lookup_initpid_in_store(fc->pid);
2124 if (initpid <= 0)
2125 initpid = fc->pid;
2126 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2127 if (nextcg) {
2128 ret = filler(buf, nextcg, NULL, 0);
2129 free(nextcg);
2130 if (ret != 0) {
2131 ret = -EIO;
2132 goto out;
2133 }
2134 }
2135 ret = 0;
2136 goto out;
2137 }
2138
2139 for (i = 0; list && list[i]; i++) {
2140 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2141 ret = -EIO;
2142 goto out;
2143 }
2144 }
2145
2146 // now get the list of child cgroups
2147
2148 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2149 ret = 0;
2150 goto out;
2151 }
2152 if (clist) {
2153 for (i = 0; clist[i]; i++) {
2154 if (filler(buf, clist[i], NULL, 0) != 0) {
2155 ret = -EIO;
2156 goto out;
2157 }
2158 }
2159 }
2160 ret = 0;
2161
2162 out:
2163 free_keys(list);
2164 if (clist) {
2165 for (i = 0; clist[i]; i++)
2166 free(clist[i]);
2167 free(clist);
2168 }
2169 return ret;
2170 }
2171
2172 static pthread_mutex_t do_release_file_info_lock = PTHREAD_MUTEX_INITIALIZER;
2173
2174 static void do_release_file_info(struct fuse_file_info *fi)
2175 {
2176 lock_mutex(&do_release_file_info_lock);
2177
2178 struct file_info *f = (struct file_info *)fi->fh;
2179
2180 if (!f)
2181 unlock_mutex(&do_release_file_info_lock);
2182 return;
2183
2184 fi->fh = 0;
2185
2186 free(f->controller);
2187 f->controller = NULL;
2188 free(f->cgroup);
2189 f->cgroup = NULL;
2190 free(f->file);
2191 f->file = NULL;
2192 free(f->buf);
2193 f->buf = NULL;
2194 free(f);
2195 f = NULL;
2196
2197 unlock_mutex(&do_release_file_info_lock);
2198 }
2199
/*
 * FUSE releasedir: frees the handle created by cg_opendir(). @path is not
 * used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
2205
2206 int cg_open(const char *path, struct fuse_file_info *fi)
2207 {
2208 const char *cgroup;
2209 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2210 struct cgfs_files *k = NULL;
2211 struct file_info *file_info;
2212 struct fuse_context *fc = fuse_get_context();
2213 int ret;
2214
2215 if (!fc)
2216 return -EIO;
2217
2218 controller = pick_controller_from_path(fc, path);
2219 if (!controller)
2220 return -errno;
2221 cgroup = find_cgroup_in_path(path);
2222 if (!cgroup)
2223 return -errno;
2224
2225 get_cgdir_and_path(cgroup, &cgdir, &last);
2226 if (!last) {
2227 path1 = "/";
2228 path2 = cgdir;
2229 } else {
2230 path1 = cgdir;
2231 path2 = last;
2232 }
2233
2234 k = cgfs_get_key(controller, path1, path2);
2235 if (!k) {
2236 ret = -EINVAL;
2237 goto out;
2238 }
2239 free_key(k);
2240
2241 pid_t initpid = lookup_initpid_in_store(fc->pid);
2242 if (initpid <= 0)
2243 initpid = fc->pid;
2244 if (!caller_may_see_dir(initpid, controller, path1)) {
2245 ret = -ENOENT;
2246 goto out;
2247 }
2248 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2249 ret = -EACCES;
2250 goto out;
2251 }
2252
2253 /* we'll free this at cg_release */
2254 file_info = malloc(sizeof(*file_info));
2255 if (!file_info) {
2256 ret = -ENOMEM;
2257 goto out;
2258 }
2259 file_info->controller = must_copy_string(controller);
2260 file_info->cgroup = must_copy_string(path1);
2261 file_info->file = must_copy_string(path2);
2262 file_info->type = LXC_TYPE_CGFILE;
2263 file_info->buf = NULL;
2264 file_info->buflen = 0;
2265
2266 fi->fh = (unsigned long)file_info;
2267 ret = 0;
2268
2269 out:
2270 free(cgdir);
2271 return ret;
2272 }
2273
2274 int cg_access(const char *path, int mode)
2275 {
2276 int ret;
2277 const char *cgroup;
2278 char *path1, *path2, *controller;
2279 char *last = NULL, *cgdir = NULL;
2280 struct cgfs_files *k = NULL;
2281 struct fuse_context *fc = fuse_get_context();
2282
2283 if (strcmp(path, "/cgroup") == 0)
2284 return 0;
2285
2286 if (!fc)
2287 return -EIO;
2288
2289 controller = pick_controller_from_path(fc, path);
2290 if (!controller)
2291 return -errno;
2292 cgroup = find_cgroup_in_path(path);
2293 if (!cgroup) {
2294 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2295 if ((mode & W_OK) == 0)
2296 return 0;
2297 return -EACCES;
2298 }
2299
2300 get_cgdir_and_path(cgroup, &cgdir, &last);
2301 if (!last) {
2302 path1 = "/";
2303 path2 = cgdir;
2304 } else {
2305 path1 = cgdir;
2306 path2 = last;
2307 }
2308
2309 k = cgfs_get_key(controller, path1, path2);
2310 if (!k) {
2311 if ((mode & W_OK) == 0)
2312 ret = 0;
2313 else
2314 ret = -EACCES;
2315 goto out;
2316 }
2317 free_key(k);
2318
2319 pid_t initpid = lookup_initpid_in_store(fc->pid);
2320 if (initpid <= 0)
2321 initpid = fc->pid;
2322 if (!caller_may_see_dir(initpid, controller, path1)) {
2323 ret = -ENOENT;
2324 goto out;
2325 }
2326 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2327 ret = -EACCES;
2328 goto out;
2329 }
2330
2331 ret = 0;
2332
2333 out:
2334 free(cgdir);
2335 return ret;
2336 }
2337
/*
 * FUSE release: frees the handle created by cg_open(). @path is not used.
 * Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
2343
/* epoll events that count as "something to read" on the socket. */
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events or roughly
 * @timeout seconds have passed. The wait is restarted, with the remaining
 * time recomputed, when epoll_wait() is interrupted by a signal.
 *
 * Returns true if an event was reported. Returns false on timeout or
 * error; errno is 0 when the deadline had already passed before waiting,
 * otherwise it holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime;

	starttime = time(NULL);
	if (starttime == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* format the error explicitly: "%m" inside a "%s" argument
		 * would be printed literally */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	now = time(NULL);
	if (now == (time_t)-1) {
		close(epfd);
		return false;
	}

	deltatime = (int)((starttime + timeout) - now);
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	/* remaining seconds converted to milliseconds, plus one */
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep epoll_wait()'s value for the caller */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2391
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most 2
 * seconds (see wait_for_sock()) for data to arrive. Returns -1 if the wait
 * fails, otherwise the result of the non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (!ready)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2398
2399 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2400 {
2401 struct msghdr msg = { 0 };
2402 struct iovec iov;
2403 struct cmsghdr *cmsg;
2404 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2405 char buf[1];
2406 buf[0] = 'p';
2407
2408 if (pingfirst) {
2409 if (msgrecv(sock, buf, 1) != 1) {
2410 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2411 return SEND_CREDS_FAIL;
2412 }
2413 }
2414
2415 msg.msg_control = cmsgbuf;
2416 msg.msg_controllen = sizeof(cmsgbuf);
2417
2418 cmsg = CMSG_FIRSTHDR(&msg);
2419 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2420 cmsg->cmsg_level = SOL_SOCKET;
2421 cmsg->cmsg_type = SCM_CREDENTIALS;
2422 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2423
2424 msg.msg_name = NULL;
2425 msg.msg_namelen = 0;
2426
2427 buf[0] = v;
2428 iov.iov_base = buf;
2429 iov.iov_len = sizeof(buf);
2430 msg.msg_iov = &iov;
2431 msg.msg_iovlen = 1;
2432
2433 if (sendmsg(sock, &msg, 0) < 0) {
2434 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2435 if (errno == 3)
2436 return SEND_CREDS_NOTSK;
2437 return SEND_CREDS_FAIL;
2438 }
2439
2440 return SEND_CREDS_OK;
2441 }
2442
/*
 * Receive one byte plus SCM_CREDENTIALS over @sock.
 *
 * SO_PASSCRED is enabled on the socket and the byte '1' is written to it
 * before waiting (at most 2 seconds, see wait_for_sock()) for the reply.
 *
 * *@v is preset to '1' and @cred to all -1. On success *@v holds the
 * received byte; @cred is only overwritten if the reply carried a
 * well-formed SCM_CREDENTIALS control message.
 *
 * Returns true if a message was received, false on any failure.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = sizeof(buf),
	};
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cmsgbuf,
		.msg_controllen = sizeof(cmsgbuf),
	};
	struct cmsghdr *cmsg;
	int enable = 1;

	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* the byte written here is the ping a sender may be waiting for */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}

	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
2500
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe pair; the wrapper writes its ACK to cpipe[1] */
	int sock;	/* first argument for @wrapped */
	pid_t tpid;	/* second argument for @wrapped */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};

/*
 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
 * with clone(). Closes the read end of the pipe, writes '1' as ACK to the
 * write end (a failed write is only logged), closes it, and then returns
 * the result of the wrapped function.
 */
static int pid_ns_clone_wrapper(void *arg)
{
	struct pid_ns_clone_args *args = arg;
	char ack = '1';

	close(args->cpipe[0]);
	if (write(args->cpipe[1], &ack, sizeof(char)) < 0)
		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
	close(args->cpipe[1]);

	return args->wrapped(args->sock, args->tpid);
}
2523
2524 /*
2525 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2526 * int value back over the socket. This shifts the pid from the
2527 * sender's pidns into tpid's pidns.
2528 */
2529 static int pid_to_ns(int sock, pid_t tpid)
2530 {
2531 char v = '0';
2532 struct ucred cred;
2533
2534 while (recv_creds(sock, &cred, &v)) {
2535 if (v == '1')
2536 return 0;
2537 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2538 return 1;
2539 }
2540 return 0;
2541 }
2542
2543
2544 /*
2545 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2546 * in your old pidns. Only children which you clone will be in the target
2547 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2548 * actually convert pids.
2549 *
2550 * Note: glibc's fork() does not respect pidns, which can lead to failed
2551 * assertions inside glibc (and thus failed forks) if the child's pid in
2552 * the pidns and the parent pid outside are identical. Using clone prevents
2553 * this issue.
2554 */
2555 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2556 {
2557 int newnsfd = -1, ret, cpipe[2];
2558 char fnam[100];
2559 pid_t cpid;
2560 char v;
2561
2562 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2563 if (ret < 0 || ret >= sizeof(fnam))
2564 _exit(1);
2565 newnsfd = open(fnam, O_RDONLY);
2566 if (newnsfd < 0)
2567 _exit(1);
2568 if (setns(newnsfd, 0) < 0)
2569 _exit(1);
2570 close(newnsfd);
2571
2572 if (pipe(cpipe) < 0)
2573 _exit(1);
2574
2575 struct pid_ns_clone_args args = {
2576 .cpipe = cpipe,
2577 .sock = sock,
2578 .tpid = tpid,
2579 .wrapped = &pid_to_ns
2580 };
2581 size_t stack_size = sysconf(_SC_PAGESIZE);
2582 void *stack = alloca(stack_size);
2583
2584 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2585 if (cpid < 0)
2586 _exit(1);
2587
2588 // give the child 1 second to be done forking and
2589 // write its ack
2590 if (!wait_for_sock(cpipe[0], 1))
2591 _exit(1);
2592 ret = read(cpipe[0], &v, 1);
2593 if (ret != sizeof(char) || v != '1')
2594 _exit(1);
2595
2596 if (!wait_for_pid(cpid))
2597 _exit(1);
2598 _exit(0);
2599 }
2600
2601 /*
2602 * To read cgroup files with a particular pid, we will setns into the child
2603 * pidns, open a pipe, fork a child - which will be the first to really be in
2604 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2605 */
2606 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2607 {
2608 int sock[2] = {-1, -1};
2609 char *tmpdata = NULL;
2610 int ret;
2611 pid_t qpid, cpid = -1;
2612 bool answer = false;
2613 char v = '0';
2614 struct ucred cred;
2615 size_t sz = 0, asz = 0;
2616
2617 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2618 return false;
2619
2620 /*
2621 * Now we read the pids from returned data one by one, pass
2622 * them into a child in the target namespace, read back the
2623 * translated pids, and put them into our to-return data
2624 */
2625
2626 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2627 perror("socketpair");
2628 free(tmpdata);
2629 return false;
2630 }
2631
2632 cpid = fork();
2633 if (cpid == -1)
2634 goto out;
2635
2636 if (!cpid) // child - exits when done
2637 pid_to_ns_wrapper(sock[1], tpid);
2638
2639 char *ptr = tmpdata;
2640 cred.uid = 0;
2641 cred.gid = 0;
2642 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2643 cred.pid = qpid;
2644 ret = send_creds(sock[0], &cred, v, true);
2645
2646 if (ret == SEND_CREDS_NOTSK)
2647 goto next;
2648 if (ret == SEND_CREDS_FAIL)
2649 goto out;
2650
2651 // read converted results
2652 if (!wait_for_sock(sock[0], 2)) {
2653 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2654 goto out;
2655 }
2656 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2657 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2658 goto out;
2659 }
2660 must_strcat_pid(d, &sz, &asz, qpid);
2661 next:
2662 ptr = strchr(ptr, '\n');
2663 if (!ptr)
2664 break;
2665 ptr++;
2666 }
2667
2668 cred.pid = getpid();
2669 v = '1';
2670 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2671 // failed to ask child to exit
2672 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2673 goto out;
2674 }
2675
2676 answer = true;
2677
2678 out:
2679 free(tmpdata);
2680 if (cpid != -1)
2681 wait_for_pid(cpid);
2682 if (sock[0] != -1) {
2683 close(sock[0]);
2684 close(sock[1]);
2685 }
2686 return answer;
2687 }
2688
2689 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2690 struct fuse_file_info *fi)
2691 {
2692 struct fuse_context *fc = fuse_get_context();
2693 struct file_info *f = (struct file_info *)fi->fh;
2694 struct cgfs_files *k = NULL;
2695 char *data = NULL;
2696 int ret, s;
2697 bool r;
2698
2699 if (f->type != LXC_TYPE_CGFILE) {
2700 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2701 return -EIO;
2702 }
2703
2704 if (offset)
2705 return 0;
2706
2707 if (!fc)
2708 return -EIO;
2709
2710 if (!f->controller)
2711 return -EINVAL;
2712
2713 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2714 return -EINVAL;
2715 }
2716 free_key(k);
2717
2718
2719 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2720 ret = -EACCES;
2721 goto out;
2722 }
2723
2724 if (strcmp(f->file, "tasks") == 0 ||
2725 strcmp(f->file, "/tasks") == 0 ||
2726 strcmp(f->file, "/cgroup.procs") == 0 ||
2727 strcmp(f->file, "cgroup.procs") == 0)
2728 // special case - we have to translate the pids
2729 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2730 else
2731 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2732
2733 if (!r) {
2734 ret = -EINVAL;
2735 goto out;
2736 }
2737
2738 if (!data) {
2739 ret = 0;
2740 goto out;
2741 }
2742 s = strlen(data);
2743 if (s > size)
2744 s = size;
2745 memcpy(buf, data, s);
2746 if (s > 0 && s < size && data[s-1] != '\n')
2747 buf[s++] = '\n';
2748
2749 ret = s;
2750
2751 out:
2752 free(data);
2753 return ret;
2754 }
2755
2756 static int pid_from_ns(int sock, pid_t tpid)
2757 {
2758 pid_t vpid;
2759 struct ucred cred;
2760 char v;
2761 int ret;
2762
2763 cred.uid = 0;
2764 cred.gid = 0;
2765 while (1) {
2766 if (!wait_for_sock(sock, 2)) {
2767 lxcfs_error("%s\n", "Timeout reading from parent.");
2768 return 1;
2769 }
2770 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2771 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2772 return 1;
2773 }
2774 if (vpid == -1) // done
2775 break;
2776 v = '0';
2777 cred.pid = vpid;
2778 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2779 v = '1';
2780 cred.pid = getpid();
2781 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2782 return 1;
2783 }
2784 }
2785 return 0;
2786 }
2787
2788 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2789 {
2790 int newnsfd = -1, ret, cpipe[2];
2791 char fnam[100];
2792 pid_t cpid;
2793 char v;
2794
2795 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2796 if (ret < 0 || ret >= sizeof(fnam))
2797 _exit(1);
2798 newnsfd = open(fnam, O_RDONLY);
2799 if (newnsfd < 0)
2800 _exit(1);
2801 if (setns(newnsfd, 0) < 0)
2802 _exit(1);
2803 close(newnsfd);
2804
2805 if (pipe(cpipe) < 0)
2806 _exit(1);
2807
2808 struct pid_ns_clone_args args = {
2809 .cpipe = cpipe,
2810 .sock = sock,
2811 .tpid = tpid,
2812 .wrapped = &pid_from_ns
2813 };
2814 size_t stack_size = sysconf(_SC_PAGESIZE);
2815 void *stack = alloca(stack_size);
2816
2817 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2818 if (cpid < 0)
2819 _exit(1);
2820
2821 // give the child 1 second to be done forking and
2822 // write its ack
2823 if (!wait_for_sock(cpipe[0], 1))
2824 _exit(1);
2825 ret = read(cpipe[0], &v, 1);
2826 if (ret != sizeof(char) || v != '1')
2827 _exit(1);
2828
2829 if (!wait_for_pid(cpid))
2830 _exit(1);
2831 _exit(0);
2832 }
2833
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * using /proc/@pid/uid_map and store it in *@answer.
 *
 * Returns false if the map cannot be opened or convert_id_to_ns()
 * yields -1 (no mapping), true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mappath[400];
	FILE *map;

	sprintf(mappath, "/proc/%d/uid_map", pid);
	map = fopen(mappath, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2855
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status, i.e. the first number on the "Uid:" and "Gid:" lines.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are set to -1 up front and stay that way for every
 * value that could not be read. Parsing stops at the first malformed
 * Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char buf[400];
	uid_t parsed_uid;
	gid_t parsed_gid;
	FILE *status;

	*uid = -1;
	*gid = -1;

	/* buf first holds the path, later each line of the file */
	sprintf(buf, "/proc/%d/status", pid);
	status = fopen(buf, "r");
	if (status == NULL) {
		lxcfs_error("Error opening %s: %s\n", buf, strerror(errno));
		return;
	}

	while (fgets(buf, 400, status)) {
		if (strncmp(buf, "Uid:", 4) == 0) {
			if (sscanf(buf+4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
		} else if (strncmp(buf, "Gid:", 4) == 0) {
			if (sscanf(buf+4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2894
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r's uid maps to root in @r's user namespace and @v's uid is
 *     mapped into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* requestor has to be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid has to be mapped there too */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2920
2921 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2922 const char *file, const char *buf)
2923 {
2924 int sock[2] = {-1, -1};
2925 pid_t qpid, cpid = -1;
2926 FILE *pids_file = NULL;
2927 bool answer = false, fail = false;
2928
2929 pids_file = open_pids_file(contrl, cg);
2930 if (!pids_file)
2931 return false;
2932
2933 /*
2934 * write the pids to a socket, have helper in writer's pidns
2935 * call movepid for us
2936 */
2937 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2938 perror("socketpair");
2939 goto out;
2940 }
2941
2942 cpid = fork();
2943 if (cpid == -1)
2944 goto out;
2945
2946 if (!cpid) { // child
2947 fclose(pids_file);
2948 pid_from_ns_wrapper(sock[1], tpid);
2949 }
2950
2951 const char *ptr = buf;
2952 while (sscanf(ptr, "%d", &qpid) == 1) {
2953 struct ucred cred;
2954 char v;
2955
2956 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2957 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2958 goto out;
2959 }
2960
2961 if (recv_creds(sock[0], &cred, &v)) {
2962 if (v == '0') {
2963 if (!may_move_pid(tpid, tuid, cred.pid)) {
2964 fail = true;
2965 break;
2966 }
2967 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2968 fail = true;
2969 }
2970 }
2971
2972 ptr = strchr(ptr, '\n');
2973 if (!ptr)
2974 break;
2975 ptr++;
2976 }
2977
2978 /* All good, write the value */
2979 qpid = -1;
2980 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2981 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2982
2983 if (!fail)
2984 answer = true;
2985
2986 out:
2987 if (cpid != -1)
2988 wait_for_pid(cpid);
2989 if (sock[0] != -1) {
2990 close(sock[0]);
2991 close(sock[1]);
2992 }
2993 if (pids_file) {
2994 if (fclose(pids_file) != 0)
2995 answer = false;
2996 }
2997 return answer;
2998 }
2999
3000 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
3001 struct fuse_file_info *fi)
3002 {
3003 struct fuse_context *fc = fuse_get_context();
3004 char *localbuf = NULL;
3005 struct cgfs_files *k = NULL;
3006 struct file_info *f = (struct file_info *)fi->fh;
3007 bool r;
3008
3009 if (f->type != LXC_TYPE_CGFILE) {
3010 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3011 return -EIO;
3012 }
3013
3014 if (offset)
3015 return 0;
3016
3017 if (!fc)
3018 return -EIO;
3019
3020 localbuf = alloca(size+1);
3021 localbuf[size] = '\0';
3022 memcpy(localbuf, buf, size);
3023
3024 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3025 size = -EINVAL;
3026 goto out;
3027 }
3028
3029 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3030 size = -EACCES;
3031 goto out;
3032 }
3033
3034 if (strcmp(f->file, "tasks") == 0 ||
3035 strcmp(f->file, "/tasks") == 0 ||
3036 strcmp(f->file, "/cgroup.procs") == 0 ||
3037 strcmp(f->file, "cgroup.procs") == 0)
3038 // special case - we have to translate the pids
3039 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3040 else
3041 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3042
3043 if (!r)
3044 size = -EINVAL;
3045
3046 out:
3047 free_key(k);
3048 return size;
3049 }
3050
3051 int cg_chown(const char *path, uid_t uid, gid_t gid)
3052 {
3053 struct fuse_context *fc = fuse_get_context();
3054 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3055 struct cgfs_files *k = NULL;
3056 const char *cgroup;
3057 int ret;
3058
3059 if (!fc)
3060 return -EIO;
3061
3062 if (strcmp(path, "/cgroup") == 0)
3063 return -EPERM;
3064
3065 controller = pick_controller_from_path(fc, path);
3066 if (!controller)
3067 return errno == ENOENT ? -EPERM : -errno;
3068
3069 cgroup = find_cgroup_in_path(path);
3070 if (!cgroup)
3071 /* this is just /cgroup/controller */
3072 return -EPERM;
3073
3074 get_cgdir_and_path(cgroup, &cgdir, &last);
3075
3076 if (!last) {
3077 path1 = "/";
3078 path2 = cgdir;
3079 } else {
3080 path1 = cgdir;
3081 path2 = last;
3082 }
3083
3084 if (is_child_cgroup(controller, path1, path2)) {
3085 // get uid, gid, from '/tasks' file and make up a mode
3086 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3087 k = cgfs_get_key(controller, cgroup, "tasks");
3088
3089 } else
3090 k = cgfs_get_key(controller, path1, path2);
3091
3092 if (!k) {
3093 ret = -EINVAL;
3094 goto out;
3095 }
3096
3097 /*
3098 * This being a fuse request, the uid and gid must be valid
3099 * in the caller's namespace. So we can just check to make
3100 * sure that the caller is root in his uid, and privileged
3101 * over the file's current owner.
3102 */
3103 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3104 ret = -EACCES;
3105 goto out;
3106 }
3107
3108 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3109
3110 out:
3111 free_key(k);
3112 free(cgdir);
3113
3114 return ret;
3115 }
3116
3117 int cg_chmod(const char *path, mode_t mode)
3118 {
3119 struct fuse_context *fc = fuse_get_context();
3120 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3121 struct cgfs_files *k = NULL;
3122 const char *cgroup;
3123 int ret;
3124
3125 if (!fc)
3126 return -EIO;
3127
3128 if (strcmp(path, "/cgroup") == 0)
3129 return -EPERM;
3130
3131 controller = pick_controller_from_path(fc, path);
3132 if (!controller)
3133 return errno == ENOENT ? -EPERM : -errno;
3134
3135 cgroup = find_cgroup_in_path(path);
3136 if (!cgroup)
3137 /* this is just /cgroup/controller */
3138 return -EPERM;
3139
3140 get_cgdir_and_path(cgroup, &cgdir, &last);
3141
3142 if (!last) {
3143 path1 = "/";
3144 path2 = cgdir;
3145 } else {
3146 path1 = cgdir;
3147 path2 = last;
3148 }
3149
3150 if (is_child_cgroup(controller, path1, path2)) {
3151 // get uid, gid, from '/tasks' file and make up a mode
3152 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3153 k = cgfs_get_key(controller, cgroup, "tasks");
3154
3155 } else
3156 k = cgfs_get_key(controller, path1, path2);
3157
3158 if (!k) {
3159 ret = -EINVAL;
3160 goto out;
3161 }
3162
3163 /*
3164 * This being a fuse request, the uid and gid must be valid
3165 * in the caller's namespace. So we can just check to make
3166 * sure that the caller is root in his uid, and privileged
3167 * over the file's current owner.
3168 */
3169 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3170 ret = -EPERM;
3171 goto out;
3172 }
3173
3174 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3175 ret = -EINVAL;
3176 goto out;
3177 }
3178
3179 ret = 0;
3180 out:
3181 free_key(k);
3182 free(cgdir);
3183 return ret;
3184 }
3185
3186 int cg_mkdir(const char *path, mode_t mode)
3187 {
3188 struct fuse_context *fc = fuse_get_context();
3189 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3190 const char *cgroup;
3191 int ret;
3192
3193 if (!fc)
3194 return -EIO;
3195
3196 controller = pick_controller_from_path(fc, path);
3197 if (!controller)
3198 return errno == ENOENT ? -EPERM : -errno;
3199
3200 cgroup = find_cgroup_in_path(path);
3201 if (!cgroup)
3202 return -errno;
3203
3204 get_cgdir_and_path(cgroup, &cgdir, &last);
3205 if (!last)
3206 path1 = "/";
3207 else
3208 path1 = cgdir;
3209
3210 pid_t initpid = lookup_initpid_in_store(fc->pid);
3211 if (initpid <= 0)
3212 initpid = fc->pid;
3213 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3214 if (!next)
3215 ret = -EINVAL;
3216 else if (last && strcmp(next, last) == 0)
3217 ret = -EEXIST;
3218 else
3219 ret = -EPERM;
3220 goto out;
3221 }
3222
3223 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3224 ret = -EACCES;
3225 goto out;
3226 }
3227 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3228 ret = -EACCES;
3229 goto out;
3230 }
3231
3232 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3233
3234 out:
3235 free(cgdir);
3236 free(next);
3237 return ret;
3238 }
3239
3240 int cg_rmdir(const char *path)
3241 {
3242 struct fuse_context *fc = fuse_get_context();
3243 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3244 const char *cgroup;
3245 int ret;
3246
3247 if (!fc)
3248 return -EIO;
3249
3250 controller = pick_controller_from_path(fc, path);
3251 if (!controller) /* Someone's trying to delete "/cgroup". */
3252 return -EPERM;
3253
3254 cgroup = find_cgroup_in_path(path);
3255 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3256 return -EPERM;
3257
3258 get_cgdir_and_path(cgroup, &cgdir, &last);
3259 if (!last) {
3260 /* Someone's trying to delete a cgroup on the same level as the
3261 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3262 * rmdir "/cgroup/blkio/init.slice".
3263 */
3264 ret = -EPERM;
3265 goto out;
3266 }
3267
3268 pid_t initpid = lookup_initpid_in_store(fc->pid);
3269 if (initpid <= 0)
3270 initpid = fc->pid;
3271 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3272 if (!last || (next && (strcmp(next, last) == 0)))
3273 ret = -EBUSY;
3274 else
3275 ret = -ENOENT;
3276 goto out;
3277 }
3278
3279 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3280 ret = -EACCES;
3281 goto out;
3282 }
3283 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3284 ret = -EACCES;
3285 goto out;
3286 }
3287
3288 if (!cgfs_remove(controller, cgroup)) {
3289 ret = -EINVAL;
3290 goto out;
3291 }
3292
3293 ret = 0;
3294
3295 out:
3296 free(cgdir);
3297 free(next);
3298 return ret;
3299 }
3300
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3307
/*
 * parse_memstat - pick selected "total_*" counters out of @memstat.
 *
 * @memstat is scanned line by line. For the first known key a line
 * starts with, the number following the key is parsed into the matching
 * output parameter, which is then divided by 1024. Outputs whose key
 * never appears are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		unsigned long *out;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
		{ "total_shmem",         shmem },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < nfields; i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			/* the value starts right after the key */
			sscanf(memstat + strlen(fields[i].key), "%lu", fields[i].out);
			*fields[i].out /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol+1;
	}
}
3344
/*
 * get_blkio_io_value - find the line of @str starting with
 * "<major>:<minor> <iotype>" and parse the number after it into *@v.
 * *@v is 0 if no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	while (*str) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}

		/* move on to the next line, if there is one */
		str = strchr(str, '\n');
		if (!str)
			return;
		str++;
	}
}
3367
3368 static int read_file(const char *path, char *buf, size_t size,
3369 struct file_info *d)
3370 {
3371 size_t linelen = 0, total_len = 0, rv = 0;
3372 char *line = NULL;
3373 char *cache = d->buf;
3374 size_t cache_size = d->buflen;
3375 FILE *f = fopen(path, "r");
3376 if (!f)
3377 return 0;
3378
3379 while (getline(&line, &linelen, f) != -1) {
3380 ssize_t l = snprintf(cache, cache_size, "%s", line);
3381 if (l < 0) {
3382 perror("Error writing to cache");
3383 rv = 0;
3384 goto err;
3385 }
3386 if (l >= cache_size) {
3387 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3388 rv = 0;
3389 goto err;
3390 }
3391 cache += l;
3392 cache_size -= l;
3393 total_len += l;
3394 }
3395
3396 d->size = total_len;
3397 if (total_len > size)
3398 total_len = size;
3399
3400 /* read from off 0 */
3401 memcpy(buf, d->buf, total_len);
3402 rv = total_len;
3403 err:
3404 fclose(f);
3405 free(line);
3406 return rv;
3407 }
3408
3409 /*
3410 * FUSE ops for /proc
3411 */
3412
/*
 * Read @file of the "memory" controller for @cgroup and return its value
 * parsed as a base-10 unsigned long, or (unsigned long)-1 if the file
 * could not be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *text = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &text))
		limit = strtoul(text, NULL, 10);
	else
		limit = -1;

	free(text);

	return limit;
}
3425
/*
 * get_min_memlimit - smallest value of @file found in @cgroup or any of
 * its ancestors.
 *
 * Starts with the value for @cgroup itself and walks up the hierarchy
 * with dirname() until "/" has been examined. Ancestors whose value
 * cannot be read (get_memlimit() returns -1) are ignored.
 *
 * Returns the minimum found; this is get_memlimit()'s result for @cgroup
 * when no ancestor has a smaller value.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);

		/*
		 * For a relative or empty path dirname() ends up at "." and
		 * never reaches "/". Stop there instead of looping forever.
		 */
		if (strcmp(copy, ".") == 0)
			break;

		memlimit = get_memlimit(copy, file);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3442
3443 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3444 struct fuse_file_info *fi)
3445 {
3446 struct fuse_context *fc = fuse_get_context();
3447 struct file_info *d = (struct file_info *)fi->fh;
3448 char *cg;
3449 char *memusage_str = NULL, *memstat_str = NULL,
3450 *memswlimit_str = NULL, *memswusage_str = NULL;
3451 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3452 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3453 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3454 hostswtotal = 0;
3455 char *line = NULL;
3456 size_t linelen = 0, total_len = 0, rv = 0;
3457 char *cache = d->buf;
3458 size_t cache_size = d->buflen;
3459 FILE *f = NULL;
3460
3461 if (offset){
3462 if (offset > d->size)
3463 return -EINVAL;
3464 if (!d->cached)
3465 return 0;
3466 int left = d->size - offset;
3467 total_len = left > size ? size: left;
3468 memcpy(buf, cache + offset, total_len);
3469 return total_len;
3470 }
3471
3472 pid_t initpid = lookup_initpid_in_store(fc->pid);
3473 if (initpid <= 0)
3474 initpid = fc->pid;
3475 cg = get_pid_cgroup(initpid, "memory");
3476 if (!cg)
3477 return read_file("/proc/meminfo", buf, size, d);
3478 prune_init_slice(cg);
3479
3480 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3481 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3482 goto err;
3483 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3484 goto err;
3485
3486 // Following values are allowed to fail, because swapaccount might be turned
3487 // off for current kernel
3488 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3489 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3490 {
3491 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3492 memswusage = strtoul(memswusage_str, NULL, 10);
3493
3494 memswlimit = memswlimit / 1024;
3495 memswusage = memswusage / 1024;
3496 }
3497
3498 memusage = strtoul(memusage_str, NULL, 10);
3499 memlimit /= 1024;
3500 memusage /= 1024;
3501
3502 parse_memstat(memstat_str, &cached, &active_anon,
3503 &inactive_anon, &active_file, &inactive_file,
3504 &unevictable, &shmem);
3505
3506 f = fopen("/proc/meminfo", "r");
3507 if (!f)
3508 goto err;
3509
3510 while (getline(&line, &linelen, f) != -1) {
3511 ssize_t l;
3512 char *printme, lbuf[100];
3513
3514 memset(lbuf, 0, 100);
3515 if (startswith(line, "MemTotal:")) {
3516 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3517 if (hosttotal < memlimit)
3518 memlimit = hosttotal;
3519 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3520 printme = lbuf;
3521 } else if (startswith(line, "MemFree:")) {
3522 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3523 printme = lbuf;
3524 } else if (startswith(line, "MemAvailable:")) {
3525 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3526 printme = lbuf;
3527 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3528 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3529 if (hostswtotal < memswlimit)
3530 memswlimit = hostswtotal;
3531 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3532 printme = lbuf;
3533 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3534 unsigned long swaptotal = memswlimit,
3535 swapusage = memswusage - memusage,
3536 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3537 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3538 printme = lbuf;
3539 } else if (startswith(line, "Slab:")) {
3540 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3541 printme = lbuf;
3542 } else if (startswith(line, "Buffers:")) {
3543 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3544 printme = lbuf;
3545 } else if (startswith(line, "Cached:")) {
3546 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3547 printme = lbuf;
3548 } else if (startswith(line, "SwapCached:")) {
3549 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3550 printme = lbuf;
3551 } else if (startswith(line, "Active:")) {
3552 snprintf(lbuf, 100, "Active: %8lu kB\n",
3553 active_anon + active_file);
3554 printme = lbuf;
3555 } else if (startswith(line, "Inactive:")) {
3556 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3557 inactive_anon + inactive_file);
3558 printme = lbuf;
3559 } else if (startswith(line, "Active(anon)")) {
3560 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3561 printme = lbuf;
3562 } else if (startswith(line, "Inactive(anon)")) {
3563 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3564 printme = lbuf;
3565 } else if (startswith(line, "Active(file)")) {
3566 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3567 printme = lbuf;
3568 } else if (startswith(line, "Inactive(file)")) {
3569 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3570 printme = lbuf;
3571 } else if (startswith(line, "Unevictable")) {
3572 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3573 printme = lbuf;
3574 } else if (startswith(line, "SReclaimable")) {
3575 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3576 printme = lbuf;
3577 } else if (startswith(line, "SUnreclaim")) {
3578 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3579 printme = lbuf;
3580 } else if (startswith(line, "Shmem:")) {
3581 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3582 printme = lbuf;
3583 } else if (startswith(line, "ShmemHugePages")) {
3584 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3585 printme = lbuf;
3586 } else if (startswith(line, "ShmemPmdMapped")) {
3587 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3588 printme = lbuf;
3589 } else
3590 printme = line;
3591
3592 l = snprintf(cache, cache_size, "%s", printme);
3593 if (l < 0) {
3594 perror("Error writing to cache");
3595 rv = 0;
3596 goto err;
3597
3598 }
3599 if (l >= cache_size) {
3600 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3601 rv = 0;
3602 goto err;
3603 }
3604
3605 cache += l;
3606 cache_size -= l;
3607 total_len += l;
3608 }
3609
3610 d->cached = 1;
3611 d->size = total_len;
3612 if (total_len > size ) total_len = size;
3613 memcpy(buf, d->buf, total_len);
3614
3615 rv = total_len;
3616 err:
3617 if (f)
3618 fclose(f);
3619 free(line);
3620 free(cg);
3621 free(memusage_str);
3622 free(memswlimit_str);
3623 free(memswusage_str);
3624 free(memstat_str);
3625 return rv;
3626 }
3627
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed,
 * or NULL if the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3640
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu N is part of
 * @cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
		cpu_in_cpuset(cpu, cpuset);
}
3651
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
 * depending on `param` ("quota" or "period"). The parameter value is
 * returned through `value`.
 *
 * Returns true on success. Returns false if `param` does not fit the file
 * name buffer, the file cannot be read, or it does not start with a
 * number; `value` is left untouched in those cases.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int len;

	/* Bounded formatting: an unexpected @param must not overflow file[]. */
	len = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (len < 0 || (size_t)len >= sizeof(file))
		return false;

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64 matches int64_t regardless of the width of long. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str);
	return rv;
}
3677
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set (or the quota parameters cannot be read),
 * zero is returned. The result never exceeds get_nprocs().
 */
int max_cpu_count(const char *cg)
{
	int64_t cfs_quota, cfs_period, count;
	int nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	/*
	 * Stay in 64 bits until the value has been clamped, so a very
	 * large quota/period ratio cannot wrap when narrowed to int.
	 */
	count = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		count += 1;

	nprocs = get_nprocs();

	if (count > nprocs)
		count = nprocs;

	return (int)count;
}
3711
/*
 * Determine whether CPU views should be used: both the "cpu" and the
 * "cpuacct" controller must be found by find_mounted_controller().
 * `cg` is currently not consulted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;
	char *cpu_ctrl, *cpuacct_ctrl;

	cpu_ctrl = find_mounted_controller("cpu", &cfd);
	if (cpu_ctrl == NULL)
		return false;

	/* Only looked up once the "cpu" controller is known to exist. */
	cpuacct_ctrl = find_mounted_controller("cpuacct", &cfd);

	return cpuacct_ctrl != NULL;
}
3730
/*
 * Tell whether `line` is a "processor : N" line of /proc/cpuinfo, i.e. it
 * starts with "processor", a colon and a decimal CPU number.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3742
3743 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3744 struct fuse_file_info *fi)
3745 {
3746 struct fuse_context *fc = fuse_get_context();
3747 struct file_info *d = (struct file_info *)fi->fh;
3748 char *cg;
3749 char *cpuset = NULL;
3750 char *line = NULL;
3751 size_t linelen = 0, total_len = 0, rv = 0;
3752 bool am_printing = false, firstline = true, is_s390x = false;
3753 int curcpu = -1, cpu, max_cpus = 0;
3754 bool use_view;
3755 char *cache = d->buf;
3756 size_t cache_size = d->buflen;
3757 FILE *f = NULL;
3758
3759 if (offset){
3760 if (offset > d->size)
3761 return -EINVAL;
3762 if (!d->cached)
3763 return 0;
3764 int left = d->size - offset;
3765 total_len = left > size ? size: left;
3766 memcpy(buf, cache + offset, total_len);
3767 return total_len;
3768 }
3769
3770 pid_t initpid = lookup_initpid_in_store(fc->pid);
3771 if (initpid <= 0)
3772 initpid = fc->pid;
3773 cg = get_pid_cgroup(initpid, "cpuset");
3774 if (!cg)
3775 return read_file("proc/cpuinfo", buf, size, d);
3776 prune_init_slice(cg);
3777
3778 cpuset = get_cpuset(cg);
3779 if (!cpuset)
3780 goto err;
3781
3782 use_view = use_cpuview(cg);
3783
3784 if (use_view)
3785 max_cpus = max_cpu_count(cg);
3786
3787 f = fopen("/proc/cpuinfo", "r");
3788 if (!f)
3789 goto err;
3790
3791 while (getline(&line, &linelen, f) != -1) {
3792 ssize_t l;
3793 if (firstline) {
3794 firstline = false;
3795 if (strstr(line, "IBM/S390") != NULL) {
3796 is_s390x = true;
3797 am_printing = true;
3798 continue;
3799 }
3800 }
3801 if (strncmp(line, "# processors:", 12) == 0)
3802 continue;
3803 if (is_processor_line(line)) {
3804 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3805 break;
3806 am_printing = cpuline_in_cpuset(line, cpuset);
3807 if (am_printing) {
3808 curcpu ++;
3809 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3810 if (l < 0) {
3811 perror("Error writing to cache");
3812 rv = 0;
3813 goto err;
3814 }
3815 if (l >= cache_size) {
3816 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3817 rv = 0;
3818 goto err;
3819 }
3820 cache += l;
3821 cache_size -= l;
3822 total_len += l;
3823 }
3824 continue;
3825 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3826 char *p;
3827 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3828 break;
3829 if (!cpu_in_cpuset(cpu, cpuset))
3830 continue;
3831 curcpu ++;
3832 p = strchr(line, ':');
3833 if (!p || !*p)
3834 goto err;
3835 p++;
3836 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3837 if (l < 0) {
3838 perror("Error writing to cache");
3839 rv = 0;
3840 goto err;
3841 }
3842 if (l >= cache_size) {
3843 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3844 rv = 0;
3845 goto err;
3846 }
3847 cache += l;
3848 cache_size -= l;
3849 total_len += l;
3850 continue;
3851
3852 }
3853 if (am_printing) {
3854 l = snprintf(cache, cache_size, "%s", line);
3855 if (l < 0) {
3856 perror("Error writing to cache");
3857 rv = 0;
3858 goto err;
3859 }
3860 if (l >= cache_size) {
3861 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3862 rv = 0;
3863 goto err;
3864 }
3865 cache += l;
3866 cache_size -= l;
3867 total_len += l;
3868 }
3869 }
3870
3871 if (is_s390x) {
3872 char *origcache = d->buf;
3873 ssize_t l;
3874 do {
3875 d->buf = malloc(d->buflen);
3876 } while (!d->buf);
3877 cache = d->buf;
3878 cache_size = d->buflen;
3879 total_len = 0;
3880 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3881 if (l < 0 || l >= cache_size) {
3882 free(origcache);
3883 goto err;
3884 }
3885 cache_size -= l;
3886 cache += l;
3887 total_len += l;
3888 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3889 if (l < 0 || l >= cache_size) {
3890 free(origcache);
3891 goto err;
3892 }
3893 cache_size -= l;
3894 cache += l;
3895 total_len += l;
3896 l = snprintf(cache, cache_size, "%s", origcache);
3897 free(origcache);
3898 if (l < 0 || l >= cache_size)
3899 goto err;
3900 total_len += l;
3901 }
3902
3903 d->cached = 1;
3904 d->size = total_len;
3905 if (total_len > size ) total_len = size;
3906
3907 /* read from off 0 */
3908 memcpy(buf, d->buf, total_len);
3909 rv = total_len;
3910 err:
3911 if (f)
3912 fclose(f);
3913 free(line);
3914 free(cpuset);
3915 free(cg);
3916 return rv;
3917 }
3918
3919 static uint64_t get_reaper_start_time(pid_t pid)
3920 {
3921 int ret;
3922 FILE *f;
3923 uint64_t starttime;
3924 /* strlen("/proc/") = 6
3925 * +
3926 * LXCFS_NUMSTRLEN64
3927 * +
3928 * strlen("/stat") = 5
3929 * +
3930 * \0 = 1
3931 * */
3932 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3933 char path[__PROC_PID_STAT_LEN];
3934 pid_t qpid;
3935
3936 qpid = lookup_initpid_in_store(pid);
3937 if (qpid <= 0) {
3938 /* Caller can check for EINVAL on 0. */
3939 errno = EINVAL;
3940 return 0;
3941 }
3942
3943 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3944 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3945 /* Caller can check for EINVAL on 0. */
3946 errno = EINVAL;
3947 return 0;
3948 }
3949
3950 f = fopen(path, "r");
3951 if (!f) {
3952 /* Caller can check for EINVAL on 0. */
3953 errno = EINVAL;
3954 return 0;
3955 }
3956
3957 /* Note that the *scanf() argument supression requires that length
3958 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3959 * at us. It's like telling someone you're not married and then asking
3960 * if you can bring your wife to the party.
3961 */
3962 ret = fscanf(f, "%*d " /* (1) pid %d */
3963 "%*s " /* (2) comm %s */
3964 "%*c " /* (3) state %c */
3965 "%*d " /* (4) ppid %d */
3966 "%*d " /* (5) pgrp %d */
3967 "%*d " /* (6) session %d */
3968 "%*d " /* (7) tty_nr %d */
3969 "%*d " /* (8) tpgid %d */
3970 "%*u " /* (9) flags %u */
3971 "%*u " /* (10) minflt %lu */
3972 "%*u " /* (11) cminflt %lu */
3973 "%*u " /* (12) majflt %lu */
3974 "%*u " /* (13) cmajflt %lu */
3975 "%*u " /* (14) utime %lu */
3976 "%*u " /* (15) stime %lu */
3977 "%*d " /* (16) cutime %ld */
3978 "%*d " /* (17) cstime %ld */
3979 "%*d " /* (18) priority %ld */
3980 "%*d " /* (19) nice %ld */
3981 "%*d " /* (20) num_threads %ld */
3982 "%*d " /* (21) itrealvalue %ld */
3983 "%" PRIu64, /* (22) starttime %llu */
3984 &starttime);
3985 if (ret != 1) {
3986 fclose(f);
3987 /* Caller can check for EINVAL on 0. */
3988 errno = EINVAL;
3989 return 0;
3990 }
3991
3992 fclose(f);
3993
3994 errno = 0;
3995 return starttime;
3996 }
3997
/*
 * Return the start time of the reaper of `pid` in seconds since boot, i.e.
 * get_reaper_start_time() divided by the number of clock ticks per second.
 * Returns 0 when the start time or the tick rate cannot be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* sysconf() reports failure as -1; a non-positive tick rate must
	 * never reach the division below, whatever errno says.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
4019
/*
 * Return how long, in whole seconds, the reaper of `pid` has been running:
 * the time since boot (CLOCK_BOOTTIME) minus the reaper's start time since
 * boot. Returns 0 when the start time is unknown (reported as 0) or the
 * clock cannot be read.
 */
static uint64_t get_reaper_age(pid_t pid)
{
	struct timespec since_boot;
	uint64_t started, now;

	started = get_reaper_start_time_in_sec(pid);
	if (started == 0)
		return 0;

	if (clock_gettime(CLOCK_BOOTTIME, &since_boot) < 0)
		return 0;

	/* Sub-second precision (tv_nsec) is deliberately ignored; whole
	 * seconds are considered good enough here.
	 */
	now = since_boot.tv_sec;

	return now - started;
}
4048
4049 /*
4050 * Returns 0 on success.
4051 * It is the caller's responsibility to free `return_usage`, unless this
4052 * function returns an error.
4053 */
4054 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4055 {
4056 int cpucount = get_nprocs_conf();
4057 struct cpuacct_usage *cpu_usage;
4058 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4059 int cg_cpu;
4060 uint64_t cg_user, cg_system;
4061 int64_t ticks_per_sec;
4062 char *usage_str = NULL;
4063
4064 ticks_per_sec = sysconf(_SC_CLK_TCK);
4065
4066 if (ticks_per_sec < 0 && errno == EINVAL) {
4067 lxcfs_debug(
4068 "%s\n",
4069 "read_cpuacct_usage_all failed to determine number of clock ticks "
4070 "in a second");
4071 return -1;
4072 }
4073
4074 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4075 if (!cpu_usage)
4076 return -ENOMEM;
4077
4078 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4079 rv = -1;
4080 goto err;
4081 }
4082
4083 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4084 lxcfs_error("read_cpuacct_usage_all reading first line from "
4085 "%s/cpuacct.usage_all failed.\n", cg);
4086 rv = -1;
4087 goto err;
4088 }
4089
4090 read_pos += read_cnt;
4091
4092 for (i = 0, j = 0; i < cpucount; i++) {
4093 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4094 &cg_system, &read_cnt);
4095
4096 if (ret == EOF)
4097 break;
4098
4099 if (ret != 3) {
4100 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4101 "failed.\n", cg);
4102 rv = -1;
4103 goto err;
4104 }
4105
4106 read_pos += read_cnt;
4107
4108 /* Convert the time from nanoseconds to USER_HZ */
4109 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4110 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4111 j++;
4112 }
4113
4114 rv = 0;
4115 *return_usage = cpu_usage;
4116 *size = cpucount;
4117
4118 err:
4119 if (usage_str)
4120 free(usage_str);
4121
4122 if (rv != 0) {
4123 free(cpu_usage);
4124 *return_usage = NULL;
4125 }
4126
4127 return rv;
4128 }
4129
4130 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4131 {
4132 int i;
4133 unsigned long sum = 0;
4134
4135 for (i = 0; i < cpu_count; i++) {
4136 if (!newer[i].online)
4137 continue;
4138
4139 /* When cpuset is changed on the fly, the CPUs might get reordered.
4140 * We could either reset all counters, or check that the substractions
4141 * below will return expected results.
4142 */
4143 if (newer[i].user > older[i].user)
4144 diff[i].user = newer[i].user - older[i].user;
4145 else
4146 diff[i].user = 0;
4147
4148 if (newer[i].system > older[i].system)
4149 diff[i].system = newer[i].system - older[i].system;
4150 else
4151 diff[i].system = 0;
4152
4153 if (newer[i].idle > older[i].idle)
4154 diff[i].idle = newer[i].idle - older[i].idle;
4155 else
4156 diff[i].idle = 0;
4157
4158 sum += diff[i].user;
4159 sum += diff[i].system;
4160 sum += diff[i].idle;
4161 }
4162
4163 return sum;
4164 }
4165
4166 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4167 {
4168 unsigned long free_space, to_add;
4169
4170 free_space = threshold - usage->user - usage->system;
4171
4172 if (free_space > usage->idle)
4173 free_space = usage->idle;
4174
4175 to_add = free_space > *surplus ? *surplus : free_space;
4176
4177 *counter += to_add;
4178 usage->idle -= to_add;
4179 *surplus -= to_add;
4180 }
4181
4182 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4183 {
4184 struct cg_proc_stat *first = NULL, *prev, *tmp;
4185
4186 for (prev = NULL; node; ) {
4187 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4188 tmp = node;
4189 lxcfs_debug("Removing stat node for %s\n", node->cg);
4190
4191 if (prev)
4192 prev->next = node->next;
4193 else
4194 first = node->next;
4195
4196 node = node->next;
4197 free_proc_stat_node(tmp);
4198 } else {
4199 if (!first)
4200 first = node;
4201 prev = node;
4202 node = node->next;
4203 }
4204 }
4205
4206 return first;
4207 }
4208
4209 #define PROC_STAT_PRUNE_INTERVAL 10
4210 static void prune_proc_stat_history(void)
4211 {
4212 int i;
4213 time_t now = time(NULL);
4214
4215 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4216 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4217
4218 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4219 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4220 return;
4221 }
4222
4223 if (proc_stat_history[i]->next) {
4224 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4225 proc_stat_history[i]->lastcheck = now;
4226 }
4227
4228 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4229 }
4230 }
4231
4232 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4233 {
4234 struct cg_proc_stat *node;
4235
4236 pthread_rwlock_rdlock(&head->lock);
4237
4238 if (!head->next) {
4239 pthread_rwlock_unlock(&head->lock);
4240 return NULL;
4241 }
4242
4243 node = head->next;
4244
4245 do {
4246 if (strcmp(cg, node->cg) == 0)
4247 goto out;
4248 } while ((node = node->next));
4249
4250 node = NULL;
4251
4252 out:
4253 pthread_rwlock_unlock(&head->lock);
4254 prune_proc_stat_history();
4255 return node;
4256 }
4257
4258 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4259 {
4260 struct cg_proc_stat *node;
4261 int i;
4262
4263 node = malloc(sizeof(struct cg_proc_stat));
4264 if (!node)
4265 goto err;
4266
4267 node->cg = NULL;
4268 node->usage = NULL;
4269 node->view = NULL;
4270
4271 node->cg = malloc(strlen(cg) + 1);
4272 if (!node->cg)
4273 goto err;
4274
4275 strcpy(node->cg, cg);
4276
4277 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4278 if (!node->usage)
4279 goto err;
4280
4281 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4282
4283 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4284 if (!node->view)
4285 goto err;
4286
4287 node->cpu_count = cpu_count;
4288 node->next = NULL;
4289
4290 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4291 lxcfs_error("%s\n", "Failed to initialize node lock");
4292 goto err;
4293 }
4294
4295 for (i = 0; i < cpu_count; i++) {
4296 node->view[i].user = 0;
4297 node->view[i].system = 0;
4298 node->view[i].idle = 0;
4299 }
4300
4301 return node;
4302
4303 err:
4304 if (node && node->cg)
4305 free(node->cg);
4306 if (node && node->usage)
4307 free(node->usage);
4308 if (node && node->view)
4309 free(node->view);
4310 if (node)
4311 free(node);
4312
4313 return NULL;
4314 }
4315
4316 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4317 {
4318 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4319 struct cg_proc_stat_head *head = proc_stat_history[hash];
4320 struct cg_proc_stat *node, *rv = new_node;
4321
4322 pthread_rwlock_wrlock(&head->lock);
4323
4324 if (!head->next) {
4325 head->next = new_node;
4326 goto out;
4327 }
4328
4329 node = head->next;
4330
4331 for (;;) {
4332 if (strcmp(node->cg, new_node->cg) == 0) {
4333 /* The node is already present, return it */
4334 free_proc_stat_node(new_node);
4335 rv = node;
4336 goto out;
4337 }
4338
4339 if (node->next) {
4340 node = node->next;
4341 continue;
4342 }
4343
4344 node->next = new_node;
4345 goto out;
4346 }
4347
4348 out:
4349 pthread_rwlock_unlock(&head->lock);
4350 return rv;
4351 }
4352
4353 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4354 {
4355 struct cpuacct_usage *new_usage, *new_view;
4356 int i;
4357
4358 /* Allocate new memory */
4359 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4360 if (!new_usage)
4361 return false;
4362
4363 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4364 if (!new_view) {
4365 free(new_usage);
4366 return false;
4367 }
4368
4369 /* Copy existing data & initialize new elements */
4370 for (i = 0; i < cpu_count; i++) {
4371 if (i < node->cpu_count) {
4372 new_usage[i].user = node->usage[i].user;
4373 new_usage[i].system = node->usage[i].system;
4374 new_usage[i].idle = node->usage[i].idle;
4375
4376 new_view[i].user = node->view[i].user;
4377 new_view[i].system = node->view[i].system;
4378 new_view[i].idle = node->view[i].idle;
4379 } else {
4380 new_usage[i].user = 0;
4381 new_usage[i].system = 0;
4382 new_usage[i].idle = 0;
4383
4384 new_view[i].user = 0;
4385 new_view[i].system = 0;
4386 new_view[i].idle = 0;
4387 }
4388 }
4389
4390 free(node->usage);
4391 free(node->view);
4392
4393 node->usage = new_usage;
4394 node->view = new_view;
4395 node->cpu_count = cpu_count;
4396
4397 return true;
4398 }
4399
4400 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4401 {
4402 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4403 struct cg_proc_stat_head *head = proc_stat_history[hash];
4404 struct cg_proc_stat *node;
4405
4406 node = find_proc_stat_node(head, cg);
4407
4408 if (!node) {
4409 node = new_proc_stat_node(usage, cpu_count, cg);
4410 if (!node)
4411 return NULL;
4412
4413 node = add_proc_stat_node(node);
4414 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4415 }
4416
4417 pthread_mutex_lock(&node->lock);
4418
4419 /* If additional CPUs on the host have been enabled, CPU usage counter
4420 * arrays have to be expanded */
4421 if (node->cpu_count < cpu_count) {
4422 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4423 node->cpu_count, cpu_count, cg);
4424
4425 if (!expand_proc_stat_node(node, cpu_count)) {
4426 pthread_mutex_unlock(&node->lock);
4427 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4428 node->cpu_count, cpu_count, cg);
4429 return NULL;
4430 }
4431 }
4432
4433 return node;
4434 }
4435
4436 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4437 {
4438 int i;
4439
4440 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4441 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4442
4443 for (i = 0; i < cpu_count; i++) {
4444 node->view[i].user = 0;
4445 node->view[i].system = 0;
4446 node->view[i].idle = 0;
4447 }
4448
4449 node->cpu_count = cpu_count;
4450 }
4451
4452 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4453 {
4454 char *line = NULL;
4455 size_t linelen = 0, total_len = 0, rv = 0, l;
4456 int curcpu = -1; /* cpu numbering starts at 0 */
4457 int physcpu, i;
4458 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4459 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4460 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4461 unsigned long user_surplus = 0, system_surplus = 0;
4462 unsigned long total_sum, threshold;
4463 struct cg_proc_stat *stat_node;
4464 struct cpuacct_usage *diff = NULL;
4465 int nprocs = get_nprocs_conf();
4466
4467 if (cg_cpu_usage_size < nprocs)
4468 nprocs = cg_cpu_usage_size;
4469
4470 /* Read all CPU stats and stop when we've encountered other lines */
4471 while (getline(&line, &linelen, f) != -1) {
4472 int ret;
4473 char cpu_char[10]; /* That's a lot of cores */
4474 uint64_t all_used, cg_used;
4475
4476 if (strlen(line) == 0)
4477 continue;
4478 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4479 /* not a ^cpuN line containing a number N */
4480 break;
4481 }
4482
4483 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4484 continue;
4485
4486 if (physcpu >= cg_cpu_usage_size)
4487 continue;
4488
4489 curcpu ++;
4490 cpu_cnt ++;
4491
4492 if (!cpu_in_cpuset(physcpu, cpuset)) {
4493 for (i = curcpu; i <= physcpu; i++) {
4494 cg_cpu_usage[i].online = false;
4495 }
4496 continue;
4497 }
4498
4499 if (curcpu < physcpu) {
4500 /* Some CPUs may be disabled */
4501 for (i = curcpu; i < physcpu; i++)
4502 cg_cpu_usage[i].online = false;
4503
4504 curcpu = physcpu;
4505 }
4506
4507 cg_cpu_usage[curcpu].online = true;
4508
4509 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4510 &user,
4511 &nice,
4512 &system,
4513 &idle,
4514 &iowait,
4515 &irq,
4516 &softirq,
4517 &steal,
4518 &guest,
4519 &guest_nice);
4520
4521 if (ret != 10)
4522 continue;
4523
4524 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4525 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4526
4527 if (all_used >= cg_used) {
4528 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4529
4530 } else {
4531 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4532 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4533 curcpu, cg, all_used, cg_used);
4534 cg_cpu_usage[curcpu].idle = idle;
4535 }
4536 }
4537
4538 /* Cannot use more CPUs than is available due to cpuset */
4539 if (max_cpus > cpu_cnt)
4540 max_cpus = cpu_cnt;
4541
4542 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4543
4544 if (!stat_node) {
4545 lxcfs_error("unable to find/create stat node for %s\n", cg);
4546 rv = 0;
4547 goto err;
4548 }
4549
4550 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4551 if (!diff) {
4552 rv = 0;
4553 goto err;
4554 }
4555
4556 /*
4557 * If the new values are LOWER than values stored in memory, it means
4558 * the cgroup has been reset/recreated and we should reset too.
4559 */
4560 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4561 if (!cg_cpu_usage[curcpu].online)
4562 continue;
4563
4564 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4565 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4566
4567 break;
4568 }
4569
4570 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4571
4572 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4573 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4574
4575 if (!stat_node->usage[curcpu].online)
4576 continue;
4577
4578 i++;
4579
4580 stat_node->usage[curcpu].user += diff[curcpu].user;
4581 stat_node->usage[curcpu].system += diff[curcpu].system;
4582 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4583
4584 if (max_cpus > 0 && i >= max_cpus) {
4585 user_surplus += diff[curcpu].user;
4586 system_surplus += diff[curcpu].system;
4587 }
4588 }
4589
4590 /* Calculate usage counters of visible CPUs */
4591 if (max_cpus > 0) {
4592 /* threshold = maximum usage per cpu, including idle */
4593 threshold = total_sum / cpu_cnt * max_cpus;
4594
4595 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4596 if (i == max_cpus)
4597 break;
4598
4599 if (!stat_node->usage[curcpu].online)
4600 continue;
4601
4602 i++;
4603
4604 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4605 continue;
4606
4607 /* Add user */
4608 add_cpu_usage(
4609 &user_surplus,
4610 &diff[curcpu],
4611 &diff[curcpu].user,
4612 threshold);
4613
4614 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4615 continue;
4616
4617 /* If there is still room, add system */
4618 add_cpu_usage(
4619 &system_surplus,
4620 &diff[curcpu],
4621 &diff[curcpu].system,
4622 threshold);
4623 }
4624
4625 if (user_surplus > 0)
4626 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4627 if (system_surplus > 0)
4628 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4629
4630 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4631 if (i == max_cpus)
4632 break;
4633
4634 if (!stat_node->usage[curcpu].online)
4635 continue;
4636
4637 i++;
4638
4639 stat_node->view[curcpu].user += diff[curcpu].user;
4640 stat_node->view[curcpu].system += diff[curcpu].system;
4641 stat_node->view[curcpu].idle += diff[curcpu].idle;
4642
4643 user_sum += stat_node->view[curcpu].user;
4644 system_sum += stat_node->view[curcpu].system;
4645 idle_sum += stat_node->view[curcpu].idle;
4646 }
4647
4648 } else {
4649 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4650 if (!stat_node->usage[curcpu].online)
4651 continue;
4652
4653 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4654 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4655 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4656
4657 user_sum += stat_node->view[curcpu].user;
4658 system_sum += stat_node->view[curcpu].system;
4659 idle_sum += stat_node->view[curcpu].idle;
4660 }
4661 }
4662
4663 /* Render the file */
4664 /* cpu-all */
4665 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4666 user_sum,
4667 system_sum,
4668 idle_sum);
4669
4670 if (l < 0) {
4671 perror("Error writing to cache");
4672 rv = 0;
4673 goto err;
4674
4675 }
4676 if (l >= buf_size) {
4677 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678 rv = 0;
4679 goto err;
4680 }
4681
4682 buf += l;
4683 buf_size -= l;
4684 total_len += l;
4685
4686 /* Render visible CPUs */
4687 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4688 if (!stat_node->usage[curcpu].online)
4689 continue;
4690
4691 i++;
4692
4693 if (max_cpus > 0 && i == max_cpus)
4694 break;
4695
4696 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4697 i,
4698 stat_node->view[curcpu].user,
4699 stat_node->view[curcpu].system,
4700 stat_node->view[curcpu].idle);
4701
4702 if (l < 0) {
4703 perror("Error writing to cache");
4704 rv = 0;
4705 goto err;
4706
4707 }
4708 if (l >= buf_size) {
4709 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4710 rv = 0;
4711 goto err;
4712 }
4713
4714 buf += l;
4715 buf_size -= l;
4716 total_len += l;
4717 }
4718
4719 /* Pass the rest of /proc/stat, start with the last line read */
4720 l = snprintf(buf, buf_size, "%s", line);
4721
4722 if (l < 0) {
4723 perror("Error writing to cache");
4724 rv = 0;
4725 goto err;
4726
4727 }
4728 if (l >= buf_size) {
4729 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4730 rv = 0;
4731 goto err;
4732 }
4733
4734 buf += l;
4735 buf_size -= l;
4736 total_len += l;
4737
4738 /* Pass the rest of the host's /proc/stat */
4739 while (getline(&line, &linelen, f) != -1) {
4740 l = snprintf(buf, buf_size, "%s", line);
4741 if (l < 0) {
4742 perror("Error writing to cache");
4743 rv = 0;
4744 goto err;
4745 }
4746 if (l >= buf_size) {
4747 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4748 rv = 0;
4749 goto err;
4750 }
4751 buf += l;
4752 buf_size -= l;
4753 total_len += l;
4754 }
4755
4756 rv = total_len;
4757
4758 err:
4759 if (stat_node)
4760 pthread_mutex_unlock(&stat_node->lock);
4761 if (line)
4762 free(line);
4763 if (diff)
4764 free(diff);
4765 return rv;
4766 }
4767
4768 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4769 static int proc_stat_read(char *buf, size_t size, off_t offset,
4770 struct fuse_file_info *fi)
4771 {
4772 struct fuse_context *fc = fuse_get_context();
4773 struct file_info *d = (struct file_info *)fi->fh;
4774 char *cg;
4775 char *cpuset = NULL;
4776 char *line = NULL;
4777 size_t linelen = 0, total_len = 0, rv = 0;
4778 int curcpu = -1; /* cpu numbering starts at 0 */
4779 int physcpu = 0;
4780 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4781 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4782 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4783 char cpuall[CPUALL_MAX_SIZE];
4784 /* reserve for cpu all */
4785 char *cache = d->buf + CPUALL_MAX_SIZE;
4786 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4787 FILE *f = NULL;
4788 struct cpuacct_usage *cg_cpu_usage = NULL;
4789 int cg_cpu_usage_size = 0;
4790
4791 if (offset){
4792 if (offset > d->size)
4793 return -EINVAL;
4794 if (!d->cached)
4795 return 0;
4796 int left = d->size - offset;
4797 total_len = left > size ? size: left;
4798 memcpy(buf, d->buf + offset, total_len);
4799 return total_len;
4800 }
4801
4802 pid_t initpid = lookup_initpid_in_store(fc->pid);
4803 if (initpid <= 0)
4804 initpid = fc->pid;
4805 cg = get_pid_cgroup(initpid, "cpuset");
4806 if (!cg)
4807 return read_file("/proc/stat", buf, size, d);
4808 prune_init_slice(cg);
4809
4810 cpuset = get_cpuset(cg);
4811 if (!cpuset)
4812 goto err;
4813
4814 /*
4815 * Read cpuacct.usage_all for all CPUs.
4816 * If the cpuacct cgroup is present, it is used to calculate the container's
4817 * CPU usage. If not, values from the host's /proc/stat are used.
4818 */
4819 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4820 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4821 "falling back to the host's /proc/stat");
4822 }
4823
4824 f = fopen("/proc/stat", "r");
4825 if (!f)
4826 goto err;
4827
4828 //skip first line
4829 if (getline(&line, &linelen, f) < 0) {
4830 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4831 goto err;
4832 }
4833
4834 if (use_cpuview(cg) && cg_cpu_usage) {
4835 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4836 f, d->buf, d->buflen);
4837 goto out;
4838 }
4839
4840 while (getline(&line, &linelen, f) != -1) {
4841 ssize_t l;
4842 char cpu_char[10]; /* That's a lot of cores */
4843 char *c;
4844 uint64_t all_used, cg_used, new_idle;
4845 int ret;
4846
4847 if (strlen(line) == 0)
4848 continue;
4849 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4850 /* not a ^cpuN line containing a number N, just print it */
4851 l = snprintf(cache, cache_size, "%s", line);
4852 if (l < 0) {
4853 perror("Error writing to cache");
4854 rv = 0;
4855 goto err;
4856 }
4857 if (l >= cache_size) {
4858 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4859 rv = 0;
4860 goto err;
4861 }
4862 cache += l;
4863 cache_size -= l;
4864 total_len += l;
4865 continue;
4866 }
4867
4868 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4869 continue;
4870 if (!cpu_in_cpuset(physcpu, cpuset))
4871 continue;
4872 curcpu ++;
4873
4874 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4875 &user,
4876 &nice,
4877 &system,
4878 &idle,
4879 &iowait,
4880 &irq,
4881 &softirq,
4882 &steal,
4883 &guest,
4884 &guest_nice);
4885
4886 if (ret != 10 || !cg_cpu_usage) {
4887 c = strchr(line, ' ');
4888 if (!c)
4889 continue;
4890 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4891 if (l < 0) {
4892 perror("Error writing to cache");
4893 rv = 0;
4894 goto err;
4895
4896 }
4897 if (l >= cache_size) {
4898 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4899 rv = 0;
4900 goto err;
4901 }
4902
4903 cache += l;
4904 cache_size -= l;
4905 total_len += l;
4906
4907 if (ret != 10)
4908 continue;
4909 }
4910
4911 if (cg_cpu_usage) {
4912 if (physcpu >= cg_cpu_usage_size)
4913 break;
4914
4915 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4916 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4917
4918 if (all_used >= cg_used) {
4919 new_idle = idle + (all_used - cg_used);
4920
4921 } else {
4922 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4923 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4924 curcpu, cg, all_used, cg_used);
4925 new_idle = idle;
4926 }
4927
4928 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4929 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4930 new_idle);
4931
4932 if (l < 0) {
4933 perror("Error writing to cache");
4934 rv = 0;
4935 goto err;
4936
4937 }
4938 if (l >= cache_size) {
4939 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4940 rv = 0;
4941 goto err;
4942 }
4943
4944 cache += l;
4945 cache_size -= l;
4946 total_len += l;
4947
4948 user_sum += cg_cpu_usage[physcpu].user;
4949 system_sum += cg_cpu_usage[physcpu].system;
4950 idle_sum += new_idle;
4951
4952 } else {
4953 user_sum += user;
4954 nice_sum += nice;
4955 system_sum += system;
4956 idle_sum += idle;
4957 iowait_sum += iowait;
4958 irq_sum += irq;
4959 softirq_sum += softirq;
4960 steal_sum += steal;
4961 guest_sum += guest;
4962 guest_nice_sum += guest_nice;
4963 }
4964 }
4965
4966 cache = d->buf;
4967
4968 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4969 user_sum,
4970 nice_sum,
4971 system_sum,
4972 idle_sum,
4973 iowait_sum,
4974 irq_sum,
4975 softirq_sum,
4976 steal_sum,
4977 guest_sum,
4978 guest_nice_sum);
4979 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4980 memcpy(cache, cpuall, cpuall_len);
4981 cache += cpuall_len;
4982 } else {
4983 /* shouldn't happen */
4984 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4985 cpuall_len = 0;
4986 }
4987
4988 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4989 total_len += cpuall_len;
4990
4991 out:
4992 d->cached = 1;
4993 d->size = total_len;
4994 if (total_len > size)
4995 total_len = size;
4996
4997 memcpy(buf, d->buf, total_len);
4998 rv = total_len;
4999
5000 err:
5001 if (f)
5002 fclose(f);
5003 if (cg_cpu_usage)
5004 free(cg_cpu_usage);
5005 free(line);
5006 free(cpuset);
5007 free(cg);
5008 return rv;
5009 }
5010
/*
 * Return the busy time of the group of tasks that @task's init process
 * ("reaper") belongs to, taken from the cpuacct.usage value of that process'
 * cpuacct cgroup and divided by 10^9.
 *
 * Unfortunately, this only makes sense when the container has been given its
 * own cpuacct cgroup. If not, the busy time of all other tasks that do not
 * actually belong to the container is taken into account as well. If someone
 * has a clever solution for this please send a patch!
 *
 * Returns 0 when the init pid, its cgroup or the usage value cannot be
 * obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg_path = NULL;
	char *raw_usage = NULL;
	unsigned long busy = 0;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cg_path = get_pid_cgroup(initpid, "cpuacct");
	if (cg_path) {
		prune_init_slice(cg_path);
		if (cgfs_get_value("cpuacct", cg_path, "cpuacct.usage", &raw_usage))
			busy = strtoul(raw_usage, NULL, 10) / 1000000000;
	}

	free(cg_path);
	free(raw_usage);
	return busy;
}
5041
#if RELOADTEST
/* Reload-test hook: create an empty marker file to show this code ran. */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;

	close(marker);
}
#endif
5052
5053 /*
5054 * We read /proc/uptime and reuse its second field.
5055 * For the first field, we use the mtime for the reaper for
5056 * the calling pid as returned by getreaperage
5057 */
5058 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5059 struct fuse_file_info *fi)
5060 {
5061 struct fuse_context *fc = fuse_get_context();
5062 struct file_info *d = (struct file_info *)fi->fh;
5063 unsigned long int busytime = get_reaper_busy(fc->pid);
5064 char *cache = d->buf;
5065 ssize_t total_len = 0;
5066 uint64_t idletime, reaperage;
5067
5068 #if RELOADTEST
5069 iwashere();
5070 #endif
5071
5072 if (offset){
5073 if (!d->cached)
5074 return 0;
5075 if (offset > d->size)
5076 return -EINVAL;
5077 int left = d->size - offset;
5078 total_len = left > size ? size: left;
5079 memcpy(buf, cache + offset, total_len);
5080 return total_len;
5081 }
5082
5083 reaperage = get_reaper_age(fc->pid);
5084 /* To understand why this is done, please read the comment to the
5085 * get_reaper_busy() function.
5086 */
5087 idletime = reaperage;
5088 if (reaperage >= busytime)
5089 idletime = reaperage - busytime;
5090
5091 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5092 if (total_len < 0 || total_len >= d->buflen){
5093 lxcfs_error("%s\n", "failed to write to cache");
5094 return 0;
5095 }
5096
5097 d->size = (int)total_len;
5098 d->cached = 1;
5099
5100 if (total_len > size) total_len = size;
5101
5102 memcpy(buf, d->buf, total_len);
5103 return total_len;
5104 }
5105
5106 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5107 struct fuse_file_info *fi)
5108 {
5109 char dev_name[72];
5110 struct fuse_context *fc = fuse_get_context();
5111 struct file_info *d = (struct file_info *)fi->fh;
5112 char *cg;
5113 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5114 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5115 unsigned long read = 0, write = 0;
5116 unsigned long read_merged = 0, write_merged = 0;
5117 unsigned long read_sectors = 0, write_sectors = 0;
5118 unsigned long read_ticks = 0, write_ticks = 0;
5119 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5120 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5121 char *cache = d->buf;
5122 size_t cache_size = d->buflen;
5123 char *line = NULL;
5124 size_t linelen = 0, total_len = 0, rv = 0;
5125 unsigned int major = 0, minor = 0;
5126 int i = 0;
5127 FILE *f = NULL;
5128
5129 if (offset){
5130 if (offset > d->size)
5131 return -EINVAL;
5132 if (!d->cached)
5133 return 0;
5134 int left = d->size - offset;
5135 total_len = left > size ? size: left;
5136 memcpy(buf, cache + offset, total_len);
5137 return total_len;
5138 }
5139
5140 pid_t initpid = lookup_initpid_in_store(fc->pid);
5141 if (initpid <= 0)
5142 initpid = fc->pid;
5143 cg = get_pid_cgroup(initpid, "blkio");
5144 if (!cg)
5145 return read_file("/proc/diskstats", buf, size, d);
5146 prune_init_slice(cg);
5147
5148 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5149 goto err;
5150 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5151 goto err;
5152 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5153 goto err;
5154 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5155 goto err;
5156 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5157 goto err;
5158
5159
5160 f = fopen("/proc/diskstats", "r");
5161 if (!f)
5162 goto err;
5163
5164 while (getline(&line, &linelen, f) != -1) {
5165 ssize_t l;
5166 char lbuf[256];
5167
5168 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5169 if (i != 3)
5170 continue;
5171
5172 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5173 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5174 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5175 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5176 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5177 read_sectors = read_sectors/512;
5178 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5179 write_sectors = write_sectors/512;
5180
5181 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5182 rd_svctm = rd_svctm/1000000;
5183 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5184 rd_wait = rd_wait/1000000;
5185 read_ticks = rd_svctm + rd_wait;
5186
5187 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5188 wr_svctm = wr_svctm/1000000;
5189 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5190 wr_wait = wr_wait/1000000;
5191 write_ticks = wr_svctm + wr_wait;
5192
5193 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5194 tot_ticks = tot_ticks/1000000;
5195
5196 memset(lbuf, 0, 256);
5197 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5198 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5199 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5200 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5201 else
5202 continue;
5203
5204 l = snprintf(cache, cache_size, "%s", lbuf);
5205 if (l < 0) {
5206 perror("Error writing to fuse buf");
5207 rv = 0;
5208 goto err;
5209 }
5210 if (l >= cache_size) {
5211 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5212 rv = 0;
5213 goto err;
5214 }
5215 cache += l;
5216 cache_size -= l;
5217 total_len += l;
5218 }
5219
5220 d->cached = 1;
5221 d->size = total_len;
5222 if (total_len > size ) total_len = size;
5223 memcpy(buf, d->buf, total_len);
5224
5225 rv = total_len;
5226 err:
5227 free(cg);
5228 if (f)
5229 fclose(f);
5230 free(line);
5231 free(io_serviced_str);
5232 free(io_merged_str);
5233 free(io_service_bytes_str);
5234 free(io_wait_time_str);
5235 free(io_service_time_str);
5236 return rv;
5237 }
5238
5239 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5240 struct fuse_file_info *fi)
5241 {
5242 struct fuse_context *fc = fuse_get_context();
5243 struct file_info *d = (struct file_info *)fi->fh;
5244 char *cg = NULL;
5245 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5246 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5247 ssize_t total_len = 0, rv = 0;
5248 ssize_t l = 0;
5249 char *cache = d->buf;
5250
5251 if (offset) {
5252 if (offset > d->size)
5253 return -EINVAL;
5254 if (!d->cached)
5255 return 0;
5256 int left = d->size - offset;
5257 total_len = left > size ? size: left;
5258 memcpy(buf, cache + offset, total_len);
5259 return total_len;
5260 }
5261
5262 pid_t initpid = lookup_initpid_in_store(fc->pid);
5263 if (initpid <= 0)
5264 initpid = fc->pid;
5265 cg = get_pid_cgroup(initpid, "memory");
5266 if (!cg)
5267 return read_file("/proc/swaps", buf, size, d);
5268 prune_init_slice(cg);
5269
5270 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5271
5272 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5273 goto err;
5274
5275 memusage = strtoul(memusage_str, NULL, 10);
5276
5277 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5278 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5279
5280 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5281 memswusage = strtoul(memswusage_str, NULL, 10);
5282
5283 swap_total = (memswlimit - memlimit) / 1024;
5284 swap_free = (memswusage - memusage) / 1024;
5285 }
5286
5287 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5288
5289 /* When no mem + swap limit is specified or swapaccount=0*/
5290 if (!memswlimit) {
5291 char *line = NULL;
5292 size_t linelen = 0;
5293 FILE *f = fopen("/proc/meminfo", "r");
5294
5295 if (!f)
5296 goto err;
5297
5298 while (getline(&line, &linelen, f) != -1) {
5299 if (startswith(line, "SwapTotal:")) {
5300 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5301 } else if (startswith(line, "SwapFree:")) {
5302 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5303 }
5304 }
5305
5306 free(line);
5307 fclose(f);
5308 }
5309
5310 if (swap_total > 0) {
5311 l = snprintf(d->buf + total_len, d->size - total_len,
5312 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5313 swap_total, swap_free);
5314 total_len += l;
5315 }
5316
5317 if (total_len < 0 || l < 0) {
5318 perror("Error writing to cache");
5319 rv = 0;
5320 goto err;
5321 }
5322
5323 d->cached = 1;
5324 d->size = (int)total_len;
5325
5326 if (total_len > size) total_len = size;
5327 memcpy(buf, d->buf, total_len);
5328 rv = total_len;
5329
5330 err:
5331 free(cg);
5332 free(memswlimit_str);
5333 free(memlimit_str);
5334 free(memusage_str);
5335 free(memswusage_str);
5336 return rv;
5337 }
/*
 * Collect the lines of the cgroup.procs files found at and below a cgroup
 * path.
 * eg: from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the
 * process pid.
 *
 * Child cgroups are visited first (up to @depth levels), then the
 * cgroup.procs of @dpath itself is read. Every line is stored verbatim,
 * including its trailing newline, as a separately malloc()ed string.
 *
 * @pid_buf : array of strings the lines are appended to; it is grown with
 *            realloc() and must therefore point to heap memory.
 * @dpath   : the path of the cgroup relative to @cfd.
 *            eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many levels of child cgroups to descend into.
 * @sum     : number of entries already stored in @pid_buf.
 * @cfd     : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Returns the new number of entries in @pid_buf. When @dpath or its
 * cgroup.procs cannot be opened, the count gathered so far is returned.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + \0 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (depth > 0 && (file = readdir(dir)) != NULL) {
		/*
		 * Skip exactly "." and "..". A one-character prefix comparison
		 * would also skip every cgroup whose name merely starts with
		 * a dot.
		 */
		if (strcmp(file->d_name, ".") == 0 ||
		    strcmp(file->d_name, "..") == 0)
			continue;
		if (file->d_type != DT_DIR)
			continue;

		/* path + '/' + d_name + \0 */
		do {
			path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
		} while (!path_dir);
		strcpy(path_dir, path);
		strcat(path_dir, "/");
		strcat(path_dir, file->d_name);
		sum = calc_pid(pid_buf, path_dir, depth - 1, sum, cfd);
		free(path_dir);
	}
	/* closedir() also closes the descriptor handed to fdopendir(). */
	closedir(dir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	free(line);
	free(path);
	return sum;
}
5424 /*
5425 * calc_load calculates the load according to the following formula:
5426 * load1 = load0 * exp + active * (1 - exp)
5427 *
5428 * @load1: the new loadavg.
5429 * @load0: the former loadavg.
5430 * @active: the total number of running pid at this moment.
5431 * @exp: the fixed-point defined in the beginning.
5432 */
5433 static unsigned long
5434 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5435 {
5436 unsigned long newload;
5437
5438 active = active > 0 ? active * FIXED_1 : 0;
5439 newload = load * exp + active * (FIXED_1 - exp);
5440 if (active >= load)
5441 newload += FIXED_1 - 1;
5442
5443 return newload / FIXED_1;
5444 }
5445
5446 /*
5447 * Return 0 means that container p->cg is closed.
5448 * Return -1 means that error occurred in refresh.
5449 * Positive num equals the total number of pid.
5450 */
5451 static int refresh_load(struct load_node *p, char *path)
5452 {
5453 FILE *f = NULL;
5454 char **idbuf;
5455 char proc_path[256];
5456 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5457 char *line = NULL;
5458 size_t linelen = 0;
5459 int sum, length;
5460 DIR *dp;
5461 struct dirent *file;
5462
5463 do {
5464 idbuf = malloc(sizeof(char *));
5465 } while (!idbuf);
5466 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5467 /* normal exit */
5468 if (sum == 0)
5469 goto out;
5470
5471 for (i = 0; i < sum; i++) {
5472 /*clean up '\n' */
5473 length = strlen(idbuf[i])-1;
5474 idbuf[i][length] = '\0';
5475 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5476 if (ret < 0 || ret > 255) {
5477 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5478 i = sum;
5479 sum = -1;
5480 goto err_out;
5481 }
5482
5483 dp = opendir(proc_path);
5484 if (!dp) {
5485 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5486 continue;
5487 }
5488 while ((file = readdir(dp)) != NULL) {
5489 if (strncmp(file->d_name, ".", 1) == 0)
5490 continue;
5491 if (strncmp(file->d_name, "..", 1) == 0)
5492 continue;
5493 total_pid++;
5494 /* We make the biggest pid become last_pid.*/
5495 ret = atof(file->d_name);
5496 last_pid = (ret > last_pid) ? ret : last_pid;
5497
5498 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5499 if (ret < 0 || ret > 255) {
5500 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5501 i = sum;
5502 sum = -1;
5503 closedir(dp);
5504 goto err_out;
5505 }
5506 f = fopen(proc_path, "r");
5507 if (f != NULL) {
5508 while (getline(&line, &linelen, f) != -1) {
5509 /* Find State */
5510 if ((line[0] == 'S') && (line[1] == 't'))
5511 break;
5512 }
5513 if ((line[7] == 'R') || (line[7] == 'D'))
5514 run_pid++;
5515 fclose(f);
5516 }
5517 }
5518 closedir(dp);
5519 }
5520 /*Calculate the loadavg.*/
5521 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5522 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5523 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5524 p->run_pid = run_pid;
5525 p->total_pid = total_pid;
5526 p->last_pid = last_pid;
5527
5528 free(line);
5529 err_out:
5530 for (; i > 0; i--)
5531 free(idbuf[i-1]);
5532 out:
5533 free(idbuf);
5534 return sum;
5535 }
5536 /*
5537 * Traverse the hash table and update it.
5538 */
5539 void *load_begin(void *arg)
5540 {
5541
5542 char *path = NULL;
5543 int i, sum, length, ret;
5544 struct load_node *f;
5545 int first_node;
5546 clock_t time1, time2;
5547
5548 while (1) {
5549 if (loadavg_stop == 1)
5550 return NULL;
5551
5552 time1 = clock();
5553 for (i = 0; i < LOAD_SIZE; i++) {
5554 pthread_mutex_lock(&load_hash[i].lock);
5555 if (load_hash[i].next == NULL) {
5556 pthread_mutex_unlock(&load_hash[i].lock);
5557 continue;
5558 }
5559 f = load_hash[i].next;
5560 first_node = 1;
5561 while (f) {
5562 length = strlen(f->cg) + 2;
5563 do {
5564 /* strlen(f->cg) + '.' or '' + \0 */
5565 path = malloc(length);
5566 } while (!path);
5567
5568 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5569 if (ret < 0 || ret > length - 1) {
5570 /* snprintf failed, ignore the node.*/
5571 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5572 goto out;
5573 }
5574 sum = refresh_load(f, path);
5575 if (sum == 0) {
5576 f = del_node(f, i);
5577 } else {
5578 out: f = f->next;
5579 }
5580 free(path);
5581 /* load_hash[i].lock locks only on the first node.*/
5582 if (first_node == 1) {
5583 first_node = 0;
5584 pthread_mutex_unlock(&load_hash[i].lock);
5585 }
5586 }
5587 }
5588
5589 if (loadavg_stop == 1)
5590 return NULL;
5591
5592 time2 = clock();
5593 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5594 }
5595 }
5596
5597 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5598 struct fuse_file_info *fi)
5599 {
5600 struct fuse_context *fc = fuse_get_context();
5601 struct file_info *d = (struct file_info *)fi->fh;
5602 pid_t initpid;
5603 char *cg;
5604 size_t total_len = 0;
5605 char *cache = d->buf;
5606 struct load_node *n;
5607 int hash;
5608 int cfd, rv = 0;
5609 unsigned long a, b, c;
5610
5611 if (offset) {
5612 if (offset > d->size)
5613 return -EINVAL;
5614 if (!d->cached)
5615 return 0;
5616 int left = d->size - offset;
5617 total_len = left > size ? size : left;
5618 memcpy(buf, cache + offset, total_len);
5619 return total_len;
5620 }
5621 if (!loadavg)
5622 return read_file("/proc/loadavg", buf, size, d);
5623
5624 initpid = lookup_initpid_in_store(fc->pid);
5625 if (initpid <= 0)
5626 initpid = fc->pid;
5627 cg = get_pid_cgroup(initpid, "cpu");
5628 if (!cg)
5629 return read_file("/proc/loadavg", buf, size, d);
5630
5631 prune_init_slice(cg);
5632 hash = calc_hash(cg) % LOAD_SIZE;
5633 n = locate_node(cg, hash);
5634
5635 /* First time */
5636 if (n == NULL) {
5637 if (!find_mounted_controller("cpu", &cfd)) {
5638 /*
5639 * In locate_node() above, pthread_rwlock_unlock() isn't used
5640 * because delete is not allowed before read has ended.
5641 */
5642 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5643 rv = 0;
5644 goto err;
5645 }
5646 do {
5647 n = malloc(sizeof(struct load_node));
5648 } while (!n);
5649
5650 do {
5651 n->cg = malloc(strlen(cg)+1);
5652 } while (!n->cg);
5653 strcpy(n->cg, cg);
5654 n->avenrun[0] = 0;
5655 n->avenrun[1] = 0;
5656 n->avenrun[2] = 0;
5657 n->run_pid = 0;
5658 n->total_pid = 1;
5659 n->last_pid = initpid;
5660 n->cfd = cfd;
5661 insert_node(&n, hash);
5662 }
5663 a = n->avenrun[0] + (FIXED_1/200);
5664 b = n->avenrun[1] + (FIXED_1/200);
5665 c = n->avenrun[2] + (FIXED_1/200);
5666 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5667 LOAD_INT(a), LOAD_FRAC(a),
5668 LOAD_INT(b), LOAD_FRAC(b),
5669 LOAD_INT(c), LOAD_FRAC(c),
5670 n->run_pid, n->total_pid, n->last_pid);
5671 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5672 if (total_len < 0 || total_len >= d->buflen) {
5673 lxcfs_error("%s\n", "Failed to write to cache");
5674 rv = 0;
5675 goto err;
5676 }
5677 d->size = (int)total_len;
5678 d->cached = 1;
5679
5680 if (total_len > size)
5681 total_len = size;
5682 memcpy(buf, d->buf, total_len);
5683 rv = total_len;
5684
5685 err:
5686 free(cg);
5687 return rv;
5688 }
5689 /* Return a positive number on success, return 0 on failure.*/
5690 pthread_t load_daemon(int load_use)
5691 {
5692 int ret;
5693 pthread_t pid;
5694
5695 ret = init_load();
5696 if (ret == -1) {
5697 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5698 return 0;
5699 }
5700 ret = pthread_create(&pid, NULL, load_begin, NULL);
5701 if (ret != 0) {
5702 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5703 load_free();
5704 return 0;
5705 }
5706 /* use loadavg, here loadavg = 1*/
5707 loadavg = load_use;
5708 return pid;
5709 }
5710
5711 /* Returns 0 on success. */
5712 int stop_load_daemon(pthread_t pid)
5713 {
5714 int s;
5715
5716 /* Signal the thread to gracefully stop */
5717 loadavg_stop = 1;
5718
5719 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5720 if (s != 0) {
5721 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5722 return -1;
5723 }
5724
5725 load_free();
5726 loadavg_stop = 0;
5727
5728 return 0;
5729 }
5730
/*
 * Return the number of bytes in file @which, determined by reading it line
 * by line and adding up the line lengths. Returns 0 when the file cannot be
 * opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
5747
/*
 * getattr handler for the /proc subtree.
 *
 * @path: path to describe.
 * @sb:   stat buffer to fill; it is zeroed first and all three timestamps
 *        are set to the current CLOCK_REALTIME time.
 *
 * "/proc" is reported as a directory (mode 0555, two links); the emulated
 * files are reported as regular files (mode 0444, one link, size 0).
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_ctim = now;
	sb->st_mtim = now;
	sb->st_atim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
5777
5778 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5779 struct fuse_file_info *fi)
5780 {
5781 if (filler(buf, ".", NULL, 0) != 0 ||
5782 filler(buf, "..", NULL, 0) != 0 ||
5783 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5784 filler(buf, "meminfo", NULL, 0) != 0 ||
5785 filler(buf, "stat", NULL, 0) != 0 ||
5786 filler(buf, "uptime", NULL, 0) != 0 ||
5787 filler(buf, "diskstats", NULL, 0) != 0 ||
5788 filler(buf, "swaps", NULL, 0) != 0 ||
5789 filler(buf, "loadavg", NULL, 0) != 0)
5790 return -EINVAL;
5791 return 0;
5792 }
5793
5794 int proc_open(const char *path, struct fuse_file_info *fi)
5795 {
5796 int type = -1;
5797 struct file_info *info;
5798
5799 if (strcmp(path, "/proc/meminfo") == 0)
5800 type = LXC_TYPE_PROC_MEMINFO;
5801 else if (strcmp(path, "/proc/cpuinfo") == 0)
5802 type = LXC_TYPE_PROC_CPUINFO;
5803 else if (strcmp(path, "/proc/uptime") == 0)
5804 type = LXC_TYPE_PROC_UPTIME;
5805 else if (strcmp(path, "/proc/stat") == 0)
5806 type = LXC_TYPE_PROC_STAT;
5807 else if (strcmp(path, "/proc/diskstats") == 0)
5808 type = LXC_TYPE_PROC_DISKSTATS;
5809 else if (strcmp(path, "/proc/swaps") == 0)
5810 type = LXC_TYPE_PROC_SWAPS;
5811 else if (strcmp(path, "/proc/loadavg") == 0)
5812 type = LXC_TYPE_PROC_LOADAVG;
5813 if (type == -1)
5814 return -ENOENT;
5815
5816 info = malloc(sizeof(*info));
5817 if (!info)
5818 return -ENOMEM;
5819
5820 memset(info, 0, sizeof(*info));
5821 info->type = type;
5822
5823 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5824 do {
5825 info->buf = malloc(info->buflen);
5826 } while (!info->buf);
5827 memset(info->buf, 0, info->buflen);
5828 /* set actual size to buffer size */
5829 info->size = info->buflen;
5830
5831 fi->fh = (unsigned long)info;
5832 return 0;
5833 }
5834
/*
 * access handler for the /proc subtree.
 *
 * "/proc" itself is granted whenever the real access(path, R_OK) succeeds.
 * Apart from that, any request asking for more than read permission is
 * refused.
 *
 * Returns 0 when access is granted, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	const int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
5845
/*
 * release handler for the emulated /proc files: hands @fi to
 * do_release_file_info(). @path is not used. Always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
5851
5852 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5853 struct fuse_file_info *fi)
5854 {
5855 struct file_info *f = (struct file_info *) fi->fh;
5856
5857 switch (f->type) {
5858 case LXC_TYPE_PROC_MEMINFO:
5859 return proc_meminfo_read(buf, size, offset, fi);
5860 case LXC_TYPE_PROC_CPUINFO:
5861 return proc_cpuinfo_read(buf, size, offset, fi);
5862 case LXC_TYPE_PROC_UPTIME:
5863 return proc_uptime_read(buf, size, offset, fi);
5864 case LXC_TYPE_PROC_STAT:
5865 return proc_stat_read(buf, size, offset, fi);
5866 case LXC_TYPE_PROC_DISKSTATS:
5867 return proc_diskstats_read(buf, size, offset, fi);
5868 case LXC_TYPE_PROC_SWAPS:
5869 return proc_swaps_read(buf, size, offset, fi);
5870 case LXC_TYPE_PROC_LOADAVG:
5871 return proc_loadavg_read(buf, size, offset, fi);
5872 default:
5873 return -EINVAL;
5874 }
5875 }
5876
5877 /*
5878 * Functions needed to setup cgroups in the __constructor__.
5879 */
5880
/*
 * Create directory @dir and all of its missing parents with mode @mode.
 *
 * The path is walked component by component; every prefix ending at a
 * component boundary is passed to mkdir(), and EEXIST is not treated as an
 * error.
 * NOTE(review): for a path that does not start with '/', the first prefix
 * is the empty string - callers presumably always pass absolute paths.
 *
 * Returns true on success, false when a prefix cannot be duplicated or
 * created.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *start = dir;
	const char *comp;		/* start of the current component */
	const char *comp_end = dir;	/* end of the last scanned component */

	for (;;) {
		char *prefix;

		comp = comp_end + strspn(comp_end, "/");
		comp_end = comp + strcspn(comp, "/");

		prefix = strndup(start, comp - start);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (comp_end == comp)
			break;
	}

	return true;
}
5904
5905 static bool umount_if_mounted(void)
5906 {
5907 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5908 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5909 return false;
5910 }
5911 return true;
5912 }
5913
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * structure itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs carries the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
5920
/*
 * Check /proc/self/mountinfo for an entry whose fifth field is "/" and whose
 * part after the first '-' starts with "- rootfs rootfs ".
 *
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true when such an entry exists, false otherwise (including when
 * the file cannot be opened).
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *entry = NULL;
	size_t cap = 0;
	bool on_ramfs = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!on_ramfs && getline(&entry, &cap, mounts) != -1) {
		char *field = entry;
		char *field_end, *sep;
		int skipped;

		/* Move to the space in front of the fifth field. */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		/* Terminate the fifth field so that it can be compared. */
		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		// this is '/'. is it the ramfs?
		sep = strchr(field_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			on_ramfs = true;
	}

	free(entry);
	fclose(mounts);
	return on_ramfs;
}
5963
5964 static int pivot_enter()
5965 {
5966 int ret = -1, oldroot = -1, newroot = -1;
5967
5968 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5969 if (oldroot < 0) {
5970 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5971 return ret;
5972 }
5973
5974 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5975 if (newroot < 0) {
5976 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5977 goto err;
5978 }
5979
5980 /* change into new root fs */
5981 if (fchdir(newroot) < 0) {
5982 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5983 goto err;
5984 }
5985
5986 /* pivot_root into our new root fs */
5987 if (pivot_root(".", ".") < 0) {
5988 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5989 goto err;
5990 }
5991
5992 /*
5993 * At this point the old-root is mounted on top of our new-root.
5994 * To unmounted it we must not be chdir'd into it, so escape back
5995 * to the old-root.
5996 */
5997 if (fchdir(oldroot) < 0) {
5998 lxcfs_error("%s\n", "Failed to enter old root.");
5999 goto err;
6000 }
6001
6002 if (umount2(".", MNT_DETACH) < 0) {
6003 lxcfs_error("%s\n", "Failed to detach old root.");
6004 goto err;
6005 }
6006
6007 if (fchdir(newroot) < 0) {
6008 lxcfs_error("%s\n", "Failed to re-enter new root.");
6009 goto err;
6010 }
6011
6012 ret = 0;
6013
6014 err:
6015 if (oldroot > 0)
6016 close(oldroot);
6017 if (newroot > 0)
6018 close(newroot);
6019
6020 return ret;
6021 }
6022
6023 static int chroot_enter()
6024 {
6025 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6026 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6027 return -1;
6028 }
6029
6030 if (chroot(".") < 0) {
6031 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6032 return -1;
6033 }
6034
6035 if (chdir("/") < 0) {
6036 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6037 return -1;
6038 }
6039
6040 return 0;
6041 }
6042
6043 static int permute_and_enter(void)
6044 {
6045 struct statfs sb;
6046
6047 if (statfs("/", &sb) < 0) {
6048 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6049 return -1;
6050 }
6051
6052 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6053 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6054 * /proc/1/mountinfo. */
6055 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6056 return chroot_enter();
6057
6058 if (pivot_enter() < 0) {
6059 lxcfs_error("%s\n", "Could not perform pivot root.");
6060 return -1;
6061 }
6062
6063 return 0;
6064 }
6065
6066 /* Prepare our new clean root. */
6067 static int permute_prepare(void)
6068 {
6069 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6070 lxcfs_error("%s\n", "Failed to create directory for new root.");
6071 return -1;
6072 }
6073
6074 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6075 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6076 return -1;
6077 }
6078
6079 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6080 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6081 return -1;
6082 }
6083
6084 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6085 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6086 return -1;
6087 }
6088
6089 return 0;
6090 }
6091
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * Returns true only when both preparing the new root and entering it
 * succeeded.
 */
static bool permute_root(void)
{
	/* The new root has to be prepared before it can be entered; entering
	 * is not attempted when preparation fails. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6105
/*
 * Open a file descriptor for the mount namespace of process @pid.
 *
 * Returns the result of opening "/proc/<pid>/ns/mnt" read-only with
 * O_CLOEXEC, or -1 when the path could not be formatted.
 */
static int preserve_mnt_ns(int pid)
{
	/* Both literal pieces plus 21 characters reserved for the pid. */
	char nspath[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(nspath))
		return -1;

	return open(nspath, O_RDONLY | O_CLOEXEC);
}
6118
6119 static bool cgfs_prepare_mounts(void)
6120 {
6121 if (!mkdir_p(BASEDIR, 0700)) {
6122 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6123 return false;
6124 }
6125
6126 if (!umount_if_mounted()) {
6127 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6128 return false;
6129 }
6130
6131 if (unshare(CLONE_NEWNS) < 0) {
6132 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6133 return false;
6134 }
6135
6136 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6137 if (cgroup_mount_ns_fd < 0) {
6138 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6139 return false;
6140 }
6141
6142 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6143 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6144 return false;
6145 }
6146
6147 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6148 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6149 return false;
6150 }
6151
6152 return true;
6153 }
6154
6155 static bool cgfs_mount_hierarchies(void)
6156 {
6157 char *target;
6158 size_t clen, len;
6159 int i, ret;
6160
6161 for (i = 0; i < num_hierarchies; i++) {
6162 char *controller = hierarchies[i];
6163
6164 clen = strlen(controller);
6165 len = strlen(BASEDIR) + clen + 2;
6166 target = malloc(len);
6167 if (!target)
6168 return false;
6169
6170 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6171 if (ret < 0 || ret >= len) {
6172 free(target);
6173 return false;
6174 }
6175 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6176 free(target);
6177 return false;
6178 }
6179 if (!strcmp(controller, "unified"))
6180 ret = mount("none", target, "cgroup2", 0, NULL);
6181 else
6182 ret = mount(controller, target, "cgroup", 0, controller);
6183 if (ret < 0) {
6184 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6185 free(target);
6186 return false;
6187 }
6188
6189 fd_hierarchies[i] = open(target, O_DIRECTORY);
6190 if (fd_hierarchies[i] < 0) {
6191 free(target);
6192 return false;
6193 }
6194 free(target);
6195 }
6196 return true;
6197 }
6198
/*
 * Set up the private cgroup mounts: prepare the mount namespace and
 * mountpoint, mount all hierarchies, then switch into the new root.
 *
 * Returns true when all three steps succeeded, false otherwise.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6214
6215 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6216 {
6217 FILE *f;
6218 char *cret, *line = NULL;
6219 char cwd[MAXPATHLEN];
6220 size_t len = 0;
6221 int i, init_ns = -1;
6222 bool found_unified = false;
6223
6224 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6225 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6226 return;
6227 }
6228
6229 while (getline(&line, &len, f) != -1) {
6230 char *idx, *p, *p2;
6231
6232 p = strchr(line, ':');
6233 if (!p)
6234 goto out;
6235 idx = line;
6236 *(p++) = '\0';
6237
6238 p2 = strrchr(p, ':');
6239 if (!p2)
6240 goto out;
6241 *p2 = '\0';
6242
6243 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6244 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6245 * because it parses out the empty string "" and later on passes
6246 * it to mount(). Let's skip such entries.
6247 */
6248 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6249 found_unified = true;
6250 p = "unified";
6251 }
6252
6253 if (!store_hierarchy(line, p))
6254 goto out;
6255 }
6256
6257 /* Preserve initial namespace. */
6258 init_ns = preserve_mnt_ns(getpid());
6259 if (init_ns < 0) {
6260 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6261 goto out;
6262 }
6263
6264 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6265 if (!fd_hierarchies) {
6266 lxcfs_error("%s\n", strerror(errno));
6267 goto out;
6268 }
6269
6270 for (i = 0; i < num_hierarchies; i++)
6271 fd_hierarchies[i] = -1;
6272
6273 cret = getcwd(cwd, MAXPATHLEN);
6274 if (!cret)
6275 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6276
6277 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6278 * to privately mount lxcfs cgroups. */
6279 if (!cgfs_setup_controllers()) {
6280 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6281 goto out;
6282 }
6283
6284 if (setns(init_ns, 0) < 0) {
6285 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6286 goto out;
6287 }
6288
6289 if (!cret || chdir(cwd) < 0)
6290 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6291
6292 if (!init_cpuview()) {
6293 lxcfs_error("%s\n", "failed to init CPU view");
6294 goto out;
6295 }
6296
6297 print_subsystems();
6298
6299 out:
6300 free(line);
6301 fclose(f);
6302 if (init_ns >= 0)
6303 close(init_ns);
6304 }
6305
6306 static void __attribute__((destructor)) free_subsystems(void)
6307 {
6308 int i;
6309
6310 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6311
6312 for (i = 0; i < num_hierarchies; i++) {
6313 if (hierarchies[i])
6314 free(hierarchies[i]);
6315 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6316 close(fd_hierarchies[i]);
6317 }
6318 free(hierarchies);
6319 free(fd_hierarchies);
6320 free_cpuview();
6321
6322 if (cgroup_mount_ns_fd >= 0)
6323 close(cgroup_mount_ns_fd);
6324 }