]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #290 from BurningXFlame/master
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "config.h" // for VERSION
42
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Thin wrapper around the raw pivot_root(2) syscall; fails with ENOSYS
 * on architectures that do not define the syscall number. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
57
/* Per-CPU usage sample taken from the cpuacct controller.
 * NOTE(review): units are presumably scheduler ticks — confirm against the
 * code that fills these fields (not visible in this chunk). */
struct cpuacct_usage {
	uint64_t user;		/* time spent in user mode */
	uint64_t system;	/* time spent in kernel mode */
	uint64_t idle;		/* idle time */
	bool online;		/* whether this CPU is currently online */
};
64
65 /* The function of hash table.*/
66 #define LOAD_SIZE 100 /*the size of hash_table */
67 #define FLUSH_TIME 5 /*the flush rate */
68 #define DEPTH_DIR 3 /*the depth of per cgroup */
69 /* The function of calculate loadavg .*/
70 #define FSHIFT 11 /* nr of bits of precision */
71 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
72 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
73 #define EXP_5 2014 /* 1/exp(5sec/5min) */
74 #define EXP_15 2037 /* 1/exp(5sec/15min) */
75 #define LOAD_INT(x) ((x) >> FSHIFT)
76 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
77 /*
78 * This parameter is used for proc_loadavg_read().
79 * 1 means use loadavg, 0 means not use.
80 */
81 static int loadavg = 0;
82 static volatile sig_atomic_t loadavg_stop = 0;
/* ELF hash of the NUL-terminated string @name, masked to a non-negative
 * int so it can be reduced modulo the table size by callers. */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int overflow;

	for (; *name != '\0'; name++) {
		hash = (hash << 4) + *name;
		overflow = hash & 0xf0000000;
		if (overflow)
			hash ^= overflow >> 24;
		hash &= ~overflow;
	}
	return (int)(hash & 0x7fffffff);
}
97
/* One cached loadavg entry, keyed by cgroup path and kept in a
 * doubly-linked hash-bucket list (see load_head below). */
struct load_node {
	char *cg;			/* cgroup path this entry is for */
	unsigned long avenrun[3];	/* load averages, fixed-point (see FSHIFT/FIXED_1) */
	unsigned int run_pid;		/* presumably running-task count — filled by refresh code not shown here */
	unsigned int total_pid;		/* presumably total-task count */
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	struct load_node **pre;		/* address of the pointer that points at this node */
};
108
/* One bucket of the loadavg hash table, with three locks covering the
 * different pairs of concurrent operations. */
struct load_head {
	/*
	 * Serializes inserting vs. refreshing load_nodes. For the first
	 * load_node of each hash bucket, insert and refresh are mutually
	 * exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * Serializes reading loadavg vs. deleting load_nodes. Per bucket,
	 * read and delete are mutually exclusive, but parallel readers are
	 * allowed. This rwlock works at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Serializes reading loadavg vs. inserting load_nodes. For the
	 * first load_node of each bucket, read and insert are mutually
	 * exclusive, but parallel readers are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
130
131 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initializes the loadavg hash table: one mutex and two rwlocks
 * per bucket. Returns 0 on success, -1 on failure; on failure every lock
 * that was successfully initialized is destroyed again.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
/* Unwind in reverse: the labels tear down whatever part of bucket i was
 * already initialized, then the loop tears down all earlier buckets. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
173
/* Push *n at the head of hash bucket @locate, maintaining back-pointers
 * (a node's pre always holds the address of the pointer pointing at it).
 * Takes the bucket mutex and the read-vs-insert rwlock. */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;	/* old head becomes our successor */
	load_hash[locate].next = *n;

	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
190 /*
191 * locate_node() finds special node. Not return NULL means success.
192 * It should be noted that rdlock isn't unlocked at the end of code
193 * because this function is used to read special node. Delete is not
194 * allowed before read has ended.
195 * unlock rdlock only in proc_loadavg_read().
196 */
197 static struct load_node *locate_node(char *cg, int locate)
198 {
199 struct load_node *f = NULL;
200 int i = 0;
201
202 pthread_rwlock_rdlock(&load_hash[locate].rilock);
203 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
204 if (load_hash[locate].next == NULL) {
205 pthread_rwlock_unlock(&load_hash[locate].rilock);
206 return f;
207 }
208 f = load_hash[locate].next;
209 pthread_rwlock_unlock(&load_hash[locate].rilock);
210 while (f && ((i = strcmp(f->cg, cg)) != 0))
211 f = f->next;
212 return f;
213 }
/* Delete the load_node n and return the next node of it. */
/* *(n->pre) is the pointer that points at @n — either the bucket head or
 * the previous node's next field. Takes the read-vs-delete rwlock. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
232
/* Tear down the whole loadavg hash table: free every node and destroy
 * every per-bucket lock. Each lock is acquired before being destroyed so
 * in-flight users have drained. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			/* empty bucket: just unlock and destroy the locks */
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
265
/* Data for CPU view */
/* Per-cgroup cached CPU statistics: the raw host numbers and the view
 * derived from them for the container. Nodes form a singly-linked list
 * hanging off a cg_proc_stat_head bucket. */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

/* Head of one cpuview hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;	/* last time this bucket was pruned/checked */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
285
286 #define CPUVIEW_HASH_SIZE 100
287 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
288
289 static bool cpuview_init_head(struct cg_proc_stat_head **head)
290 {
291 *head = malloc(sizeof(struct cg_proc_stat_head));
292 if (!(*head)) {
293 lxcfs_error("%s\n", strerror(errno));
294 return false;
295 }
296
297 (*head)->lastcheck = time(NULL);
298 (*head)->next = NULL;
299
300 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
301 lxcfs_error("%s\n", "Failed to initialize list lock");
302 free(*head);
303 return false;
304 }
305
306 return true;
307 }
308
309 static bool init_cpuview()
310 {
311 int i;
312
313 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
314 proc_stat_history[i] = NULL;
315
316 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
317 if (!cpuview_init_head(&proc_stat_history[i]))
318 goto err;
319 }
320
321 return true;
322
323 err:
324 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
325 if (proc_stat_history[i]) {
326 free(proc_stat_history[i]);
327 proc_stat_history[i] = NULL;
328 }
329 }
330
331 return false;
332 }
333
/* Release one cpuview cache node: its mutex, both usage arrays and the
 * node itself. Does not touch list linkage — callers handle unlinking. */
static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}
342
343 static void cpuview_free_head(struct cg_proc_stat_head *head)
344 {
345 struct cg_proc_stat *node, *tmp;
346
347 if (head->next) {
348 node = head->next;
349
350 for (;;) {
351 tmp = node;
352 node = node->next;
353 free_proc_stat_node(tmp);
354
355 if (!node)
356 break;
357 }
358 }
359
360 pthread_rwlock_destroy(&head->lock);
361 free(head);
362 }
363
364 static void free_cpuview()
365 {
366 int i;
367
368 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
369 if (proc_stat_history[i])
370 cpuview_free_head(proc_stat_history[i]);
371 }
372 }
373
374 /*
375 * A table caching which pid is init for a pid namespace.
376 * When looking up which pid is init for $qpid, we first
377 * 1. Stat /proc/$qpid/ns/pid.
378 * 2. Check whether the ino_t is in our store.
379 * a. if not, fork a child in qpid's ns to send us
380 * ucred.pid = 1, and read the initpid. Cache
381 * initpid and creation time for /proc/initpid
382 * in a new store entry.
383 * b. if so, verify that /proc/initpid still matches
384 * what we have saved. If not, clear the store
385 * entry and go back to a. If so, return the
386 * cached initpid.
387 */
/* One cached "which pid is init of this pid namespace" entry (see the
 * lookup algorithm described above). */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck; // last lookup time, used by prune_initpid_store()
};
395
396 /* lol - look at how they are allocated in the kernel */
397 #define PIDNS_HASH_SIZE 4096
398 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
399
400 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
401 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Take @l; a failure here is unrecoverable, so log and exit. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
411
412 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
413 * Number of hierarchies mounted. */
414 static int num_hierarchies;
415
416 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
417 * Hierachies mounted {cpuset, blkio, ...}:
418 * Initialized via __constructor__ collect_and_mount_subsystems(). */
419 static char **hierarchies;
420
421 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
422 * Open file descriptors:
423 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
424 * private mount namespace.
425 * Initialized via __constructor__ collect_and_mount_subsystems().
426 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
427 * mounts and respective files in the private namespace even when located in
428 * another namespace using the *at() family of functions
429 * {openat(), fchownat(), ...}. */
430 static int *fd_hierarchies;
431 static int cgroup_mount_ns_fd = -1;
432
/* Release @l; a failure here is unrecoverable, so log and exit. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
442
/* Grab the global pidns-store mutex (exits on failure via lock_mutex()). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the global pidns-store mutex. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
452
453 /* Must be called under store_lock */
454 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
455 {
456 struct stat initsb;
457 char fnam[100];
458
459 snprintf(fnam, 100, "/proc/%d", e->initpid);
460 if (stat(fnam, &initsb) < 0)
461 return false;
462
463 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
464 initsb.st_ctime, e->initpid);
465
466 if (e->ctime != initsb.st_ctime)
467 return false;
468 return true;
469 }
470
471 /* Must be called under store_lock */
472 static void remove_initpid(struct pidns_init_store *e)
473 {
474 struct pidns_init_store *tmp;
475 int h;
476
477 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
478
479 h = HASH(e->ino);
480 if (pidns_hash_table[h] == e) {
481 pidns_hash_table[h] = e->next;
482 free(e);
483 return;
484 }
485
486 tmp = pidns_hash_table[h];
487 while (tmp) {
488 if (tmp->next == e) {
489 tmp->next = e->next;
490 free(e);
491 return;
492 }
493 tmp = tmp->next;
494 }
495 }
496
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Rate-limited sweep of the pidns init-pid cache: runs at most once every
 * PURGE_SECS and drops every entry not looked up within 2*PURGE_SECS.
 * The very first call only records the time and removes nothing. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				/* unlink: fix up either the bucket head or
				 * the previous node's next pointer */
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
539
540 /* Must be called under store_lock */
541 static void save_initpid(struct stat *sb, pid_t pid)
542 {
543 struct pidns_init_store *e;
544 char fpath[100];
545 struct stat procsb;
546 int h;
547
548 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
549
550 snprintf(fpath, 100, "/proc/%d", pid);
551 if (stat(fpath, &procsb) < 0)
552 return;
553 do {
554 e = malloc(sizeof(*e));
555 } while (!e);
556 e->ino = sb->st_ino;
557 e->initpid = pid;
558 e->ctime = procsb.st_ctime;
559 h = HASH(e->ino);
560 e->next = pidns_hash_table[h];
561 e->lastcheck = time(NULL);
562 pidns_hash_table[h] = e;
563 }
564
565 /*
566 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
567 * entry for the inode number and creation time. Verify that the init pid
568 * is still valid. If not, remove it. Return the entry if valid, NULL
569 * otherwise.
570 * Must be called under store_lock
571 */
572 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
573 {
574 int h = HASH(sb->st_ino);
575 struct pidns_init_store *e = pidns_hash_table[h];
576
577 while (e) {
578 if (e->ino == sb->st_ino) {
579 if (initpid_still_valid(e, sb)) {
580 e->lastcheck = time(NULL);
581 return e;
582 }
583 remove_initpid(e);
584 return NULL;
585 }
586 e = e->next;
587 }
588
589 return NULL;
590 }
591
/* Return 1 if @path, resolved relative to directory fd @fd, exists and is
 * a directory; 0 otherwise.
 * Fix: the original passed @fd as fstatat()'s flags argument, which made
 * the call fail with EINVAL for essentially every descriptor; the flags
 * must be 0 (or valid AT_* flags). */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
600
/* strdup() that never fails: retries on OOM (file-wide convention).
 * NULL input yields NULL. Caller owns the returned copy. */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		;

	return copy;
}
612
/* Strip every trailing '\n' from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t n = strlen(s);

	while (n > 0 && s[n - 1] == '\n')
		s[--n] = '\0';
}
620
#define BATCH_SIZE 50
/* Grow *mem in BATCH_SIZE chunks so it can hold @newlen bytes; no-op if
 * the already-allocated batch count covers the new length. Spins on OOM
 * (file-wide convention). */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);
	*mem = grown;
}
/* Append @line (of length @linelen, NUL-terminated) to the growable
 * buffer *contents, updating *len. */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t total = *len + linelen;

	/* +1 so the terminating NUL copied below always fits */
	dorealloc(contents, *len, total + 1);
	/* linelen + 1 copies the line's trailing NUL as well */
	memcpy(*contents + *len, line, linelen + 1);
	*len = total;
}
642
/* Read the whole of @fd into a freshly allocated, NUL-terminated buffer
 * with trailing newlines stripped; NULL on failure. Takes ownership of
 * @fd: the fclose() below closes it, and — fix — when fdopen() fails we
 * now close it ourselves instead of leaking it (callers such as
 * cgfs_get_value() never close it). @from is unused here. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f = fdopen(fd, "r");

	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
664
/* Write @string to @fd (whose pathname @fnam is used only for error
 * messages). Takes ownership of @fd: fclose() closes it on both the
 * success and the short-write paths, and — fix — when fdopen() fails we
 * now close it instead of leaking it. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		/* NOTE: fwrite() is not required to set errno, so the
		 * message below may report a stale error code. */
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
690
/* Ownership/permission metadata for one cgroup file or directory, as
 * returned by cgfs_get_key(); name is heap-allocated (see free_key()). */
struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};
696
#define ALLOC_NUM 20
/* Append a copy of hierarchy name @h to the global hierarchies array,
 * growing the array in ALLOC_NUM-sized batches. Exits on allocation
 * failure; always returns true otherwise.
 * NOTE(review): @stridx is unused here — confirm whether callers rely on
 * the parameter for anything. */
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}
714
715 static void print_subsystems(void)
716 {
717 int i;
718
719 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
720 fprintf(stderr, "hierarchies:\n");
721 for (i = 0; i < num_hierarchies; i++) {
722 if (hierarchies[i])
723 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
724 fd_hierarchies[i], hierarchies[i]);
725 }
726 }
727
/* Return true iff @needle is exactly one of the comma-separated elements
 * of @haystack. */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	const char *comma;
	size_t nlen = strlen(needle);

	while (*cur && (comma = strchr(cur, ',')) != NULL) {
		/* element length must match before comparing contents */
		if ((size_t)(comma - cur) == nlen &&
		    strncmp(needle, cur, nlen) == 0)
			return true;
		cur = comma + 1;
	}
	/* last (or only) element has no trailing comma */
	return strcmp(needle, cur) == 0;
}
746
747 /* do we need to do any massaging here? I'm not sure... */
748 /* Return the mounted controller and store the corresponding open file descriptor
749 * referring to the controller mountpoint in the private lxcfs namespace in
750 * @cfd.
751 */
752 static char *find_mounted_controller(const char *controller, int *cfd)
753 {
754 int i;
755
756 for (i = 0; i < num_hierarchies; i++) {
757 if (!hierarchies[i])
758 continue;
759 if (strcmp(hierarchies[i], controller) == 0) {
760 *cfd = fd_hierarchies[i];
761 return hierarchies[i];
762 }
763 if (in_comma_list(controller, hierarchies[i])) {
764 *cfd = fd_hierarchies[i];
765 return hierarchies[i];
766 }
767 }
768
769 return NULL;
770 }
771
/* Write @value to @file of @cgroup under @controller.
 * Returns true on success; write_string() consumes the opened fd. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	char *path, *mnt;
	size_t len;
	int rc, fd, cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Build a path relative to the mount fd for the *at() call:
	 * . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	rc = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (rc < 0 || (size_t)rc >= len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
798
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
/* @dirname is resolved relative to the controller mount fd @fd. Errors on
 * individual files are logged and skipped. Fix: close fd1 when
 * fdopendir() fails — the original leaked it (on success closedir()
 * closes it). */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}
838
/* Create cgroup @cg under @controller with mode 0755 and, when a non-root
 * owner is requested, chown the directory and all files inside it to
 * @uid:@gid. Returns 0 on success or -errno. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	char *reldir, *mnt;
	size_t len;
	int cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return -EINVAL;

	/* Relative path for the *at() calls: . + /cg + \0 */
	len = strlen(cg) + 2;
	reldir = alloca(len);
	snprintf(reldir, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, reldir, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, reldir, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(reldir, uid, gid, cfd);

	return 0;
}
869
/* Recursively remove cgroup directory @dirname.
 * @fd is an open fd on the directory being removed; @cfd is the
 * controller mount fd against which all pathnames are resolved.
 * Returns true only if the whole subtree including @dirname went away.
 * NOTE(review): the recursive call passes the parent's @fd rather than
 * opening the child directory, so nested subtrees are listed from the
 * parent again — verify this matches the intended behavior upstream. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* only directories are descended into; failures are logged
		 * but do not abort the sweep */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
928
/* Recursively remove cgroup @cg under @controller.
 * Returns true when the whole subtree was removed. */
bool cgfs_remove(const char *controller, const char *cg)
{
	char *reldir, *mnt;
	size_t len;
	bool ok;
	int fd, cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() calls: . + /cg + \0 */
	len = strlen(cg) + 2;
	reldir = alloca(len);
	snprintf(reldir, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, reldir, O_DIRECTORY);
	if (fd < 0)
		return false;

	ok = recursive_rmdir(reldir, fd, cfd);
	close(fd);
	return ok;
}
955
/* chmod @file under @controller to @mode. Returns true on success. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	char *relpath, *mnt;
	size_t len;
	int cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() call: . + /file + \0 */
	len = strlen(file) + 2;
	relpath = alloca(len);
	snprintf(relpath, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) == 0;
}
976
/* Chown the "tasks" and "cgroup.procs" files inside directory @dirname
 * (resolved relative to @fd) to @uid:@gid. Returns 0 or -errno on the
 * first failure. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	static const char *names[] = { "tasks", "cgroup.procs" };
	size_t buflen, i;
	char *path;

	/* buffer sized for the longer of the two filenames */
	buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	path = alloca(buflen);

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		snprintf(path, buflen, "%s/%s", dirname, names[i]);
		if (fchownat(fd, path, uid, gid, 0) != 0)
			return -errno;
	}
	return 0;
}
992
993 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
994 {
995 int cfd;
996 size_t len;
997 char *pathname, *tmpc;
998
999 tmpc = find_mounted_controller(controller, &cfd);
1000 if (!tmpc)
1001 return -EINVAL;
1002
1003 /* Make sure we pass a relative path to *at() family of functions.
1004 * . + /file + \0
1005 */
1006 len = strlen(file) + 2;
1007 pathname = alloca(len);
1008 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1009 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
1010 return -errno;
1011
1012 if (is_dir(pathname, cfd))
1013 // like cgmanager did, we want to chown the tasks file as well
1014 return chown_tasks_files(pathname, uid, gid, cfd);
1015
1016 return 0;
1017 }
1018
/* Open @cgroup's cgroup.procs file under @controller for writing.
 * Returns a stdio stream (caller fcloses it) or NULL. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	char *relpath, *mnt;
	size_t len;
	int fd, cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return NULL;

	/* Relative path for the *at() call:
	 * . + /cgroup + /cgroup.procs + \0 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	relpath = alloca(len);
	snprintf(relpath, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, relpath, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1042
1043 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1044 void ***list, size_t typesize,
1045 void* (*iterator)(const char*, const char*, const char*))
1046 {
1047 int cfd, fd, ret;
1048 size_t len;
1049 char *cg, *tmpc;
1050 char pathname[MAXPATHLEN];
1051 size_t sz = 0, asz = 0;
1052 struct dirent *dirent;
1053 DIR *dir;
1054
1055 tmpc = find_mounted_controller(controller, &cfd);
1056 *list = NULL;
1057 if (!tmpc)
1058 return false;
1059
1060 /* Make sure we pass a relative path to *at() family of functions. */
1061 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1062 cg = alloca(len);
1063 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1064 if (ret < 0 || (size_t)ret >= len) {
1065 lxcfs_error("Pathname too long under %s\n", cgroup);
1066 return false;
1067 }
1068
1069 fd = openat(cfd, cg, O_DIRECTORY);
1070 if (fd < 0)
1071 return false;
1072
1073 dir = fdopendir(fd);
1074 if (!dir)
1075 return false;
1076
1077 while ((dirent = readdir(dir))) {
1078 struct stat mystat;
1079
1080 if (!strcmp(dirent->d_name, ".") ||
1081 !strcmp(dirent->d_name, ".."))
1082 continue;
1083
1084 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1085 if (ret < 0 || ret >= MAXPATHLEN) {
1086 lxcfs_error("Pathname too long under %s\n", cg);
1087 continue;
1088 }
1089
1090 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1091 if (ret) {
1092 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1093 continue;
1094 }
1095 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1096 (directories && !S_ISDIR(mystat.st_mode)))
1097 continue;
1098
1099 if (sz+2 >= asz) {
1100 void **tmp;
1101 asz += BATCH_SIZE;
1102 do {
1103 tmp = realloc(*list, asz * typesize);
1104 } while (!tmp);
1105 *list = tmp;
1106 }
1107 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1108 (*list)[sz+1] = NULL;
1109 sz++;
1110 }
1111 if (closedir(dir) < 0) {
1112 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1113 return false;
1114 }
1115 return true;
1116 }
1117
/* cgfs_iterate_cgroup() callback: return a heap copy of the entry name.
 * @controller and @cgroup are unused. Spins on OOM (file convention). */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while ((copy = strdup(dir_entry)) == NULL)
		;
	return copy;
}
1126
/* List the names of @cgroup's child cgroups under @controller: *list
 * becomes a NULL-terminated array of strdup'd names owned by the caller. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1131
/* Free one cgfs_files entry and its name; NULL is a no-op. */
void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free(k->name);
	free(k);
}
1139
/* Free a NULL-terminated array of cgfs_files entries plus the array
 * itself; NULL is a no-op. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;
	for (cur = keys; *cur; cur++)
		free_key(*cur);
	free(keys);
}
1151
/* Read the whole contents of @file under @cgroup for @controller into a
 * freshly allocated *value (trailing newlines stripped by slurp_file()).
 * Returns true on success; slurp_file() consumes the opened fd. */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	char *relpath, *mnt;
	size_t len;
	int rc, fd, cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() call: . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	rc = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (rc < 0 || (size_t)rc >= len)
		return false;

	fd = openat(cfd, relpath, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(relpath, fd);
	return *value != NULL;
}
1178
/* Return true iff @file exists under @cgroup for @controller. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	char *relpath, *mnt;
	size_t len;
	int rc, cfd;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() call: . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	rc = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (rc < 0 || (size_t)rc >= len)
		return false;

	return faccessat(cfd, relpath, F_OK, 0) == 0;
}
1200
1201 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1202 {
1203 int ret, cfd;
1204 size_t len;
1205 char *fnam, *tmpc;
1206 struct stat sb;
1207 struct cgfs_files *newkey;
1208
1209 tmpc = find_mounted_controller(controller, &cfd);
1210 if (!tmpc)
1211 return false;
1212
1213 if (file && *file == '/')
1214 file++;
1215
1216 if (file && strchr(file, '/'))
1217 return NULL;
1218
1219 /* Make sure we pass a relative path to *at() family of functions.
1220 * . + /cgroup + / + file + \0
1221 */
1222 len = strlen(cgroup) + 3;
1223 if (file)
1224 len += strlen(file) + 1;
1225 fnam = alloca(len);
1226 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1227 file ? "/" : "", file ? file : "");
1228
1229 ret = fstatat(cfd, fnam, &sb, 0);
1230 if (ret < 0)
1231 return NULL;
1232
1233 do {
1234 newkey = malloc(sizeof(struct cgfs_files));
1235 } while (!newkey);
1236 if (file)
1237 newkey->name = must_copy_string(file);
1238 else if (strrchr(cgroup, '/'))
1239 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1240 else
1241 newkey->name = must_copy_string(cgroup);
1242 newkey->uid = sb.st_uid;
1243 newkey->gid = sb.st_gid;
1244 newkey->mode = sb.st_mode;
1245
1246 return newkey;
1247 }
1248
/* cgfs_iterate_cgroup() callback: wrap one directory entry of @cgroup in a
 * cgfs_files key, logging (but still returning NULL) on failure. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key;

	key = cgfs_get_key(controller, cgroup, dir_entry);
	if (!key)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return key;
}
1258
/* Collect the control files of @cgroup under @controller into a
 * NULL-terminated array at *keys; each entry comes from cgfs_get_key()
 * via make_key_list_entry().  Free with free_keys().
 * NOTE(review): the element size passed is sizeof(*keys), i.e. the size of
 * a pointer — verify cgfs_iterate_cgroup() indeed expects the pointer size
 * and not sizeof(**keys).
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1263
/* Return true when @f is a child cgroup directory of @cgroup in
 * @controller (i.e. the entry exists and is a directory). */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t pathlen;
	char *relpath, *mnt;
	struct stat st;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Build ".<cgroup>/<f>" so fstatat() gets a path relative to the
	 * controller mount fd: . + /cgroup + / + f + \0 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(pathlen);
	ret = snprintf(relpath, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= pathlen)
		return false;

	if (fstatat(cfd, relpath, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode);
}
1291
1292 #define SEND_CREDS_OK 0
1293 #define SEND_CREDS_NOTSK 1
1294 #define SEND_CREDS_FAIL 2
1295 static bool recv_creds(int sock, struct ucred *cred, char *v);
1296 static int wait_for_pid(pid_t pid);
1297 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1298 static int send_creds_clone_wrapper(void *arg);
1299
1300 /*
1301 * clone a task which switches to @task's namespace and writes '1'.
1302 * over a unix sock so we can read the task's reaper's pid in our
1303 * namespace
1304 *
1305 * Note: glibc's fork() does not respect pidns, which can lead to failed
1306 * assertions inside glibc (and thus failed forks) if the child's pid in
1307 * the pidns and the parent pid outside are identical. Using clone prevents
1308 * this issue.
1309 */
/* Enter @target's pid namespace and clone a child which sends pid 1's
 * credentials back over @sock (see send_creds_clone_wrapper).  Runs in a
 * forked child of get_init_pid_for_task() and always terminates via
 * _exit(); it never returns to the caller.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* One page of stack for the cloned child.  NOTE(review): passing
	 * stack + stack_size assumes a downward-growing stack — confirm for
	 * any newly supported architecture. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* From here on, children we clone are created inside the target's
	 * pid namespace. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* NOTE(review): wait_for_pid() returns 0 on success, so this
		 * exits 1 on success and 0 on failure — inverted, but the
		 * parent ignores this process's exit status and relies on
		 * recv_creds() instead; confirm before changing. */
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1341
1342 static int send_creds_clone_wrapper(void *arg) {
1343 struct ucred cred;
1344 char v;
1345 int sock = *(int *)arg;
1346
1347 /* we are the child */
1348 cred.uid = 0;
1349 cred.gid = 0;
1350 cred.pid = 1;
1351 v = '1';
1352 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1353 return 1;
1354 return 0;
1355 }
1356
/* Discover the pid, in our namespace, of init (pid 1) inside @task's pid
 * namespace.  A forked child enters the target pidns and sends pid 1's
 * credentials over a socketpair; the kernel's SCM_CREDENTIALS handling
 * translates the sender's pid into our namespace on receipt.
 * Returns the translated pid, or -1 on failure.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		/* child: enter the pidns and report; never returns */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* parent: cred.pid arrives already translated into our pidns */
	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1390
/* Return the init pid for @qpid's pid namespace, using the pidns store as
 * a cache keyed on the /proc/<qpid>/ns/pid inode; on a cache miss the
 * value is computed via get_init_pid_for_task() and saved.
 * Returns 0 when it cannot be determined.  Takes the store lock.
 */
pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	/* Cache hit: the stored entry was verified against this ns inode. */
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}
1418
/* Reap @pid, restarting on EINTR.  Returns 0 when the child exited with
 * status 0, -1 for invalid pid, wait errors, signals, or a nonzero exit. */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t waited;

	if (pid <= 0)
		return -1;

	for (;;) {
		waited = waitpid(pid, &status, 0);
		if (waited == pid)
			break;
		if (waited == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1439
1440 /*
1441 * append the given formatted string to *src.
1442 * src: a pointer to a char* in which to append the formatted string.
1443 * sz: the number of characters printed so far, minus trailing \0.
1444 * asz: the allocated size so far
1445 * format: string format. See printf for details.
1446 * ...: varargs. See printf for details.
1447 */
1448 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1449 {
1450 char tmp[BUF_RESERVE_SIZE];
1451 va_list args;
1452
1453 va_start (args, format);
1454 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1455 va_end(args);
1456
1457 if (!*src || tmplen + *sz + 1 >= *asz) {
1458 char *tmp;
1459 do {
1460 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1461 } while (!tmp);
1462 *src = tmp;
1463 *asz += BUF_RESERVE_SIZE;
1464 }
1465 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1466 *sz += tmplen;
1467 }
1468
1469 /*
1470 * append pid to *src.
1471 * src: a pointer to a char* in which ot append the pid.
1472 * sz: the number of characters printed so far, minus trailing \0.
1473 * asz: the allocated size so far
1474 * pid: the pid to append
1475 */
/* Append "<pid>\n" to *src, growing it as needed (see must_strcat). */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	must_strcat(src, sz, asz, "%d\n", (int)pid);
}
1480
1481 /*
1482 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1483 * valid in the caller's namespace, return the id mapped into
1484 * pid's namespace.
1485 * Returns the mapped id, or -1 on error.
1486 */
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 * Each map line is "<ns base> <host base> <range>".  Returns the mapped
 * id, or (unsigned)-1 when no range covers @in_id or a range wraps.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		unsigned int ns_base, host_base, range;

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		/* A wrapping range in a procfile is unexpected — bail. */
		if (host_base + range < host_base || ns_base + range < ns_base) {
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		/* host_base <= in_id < host_base + range, and neither sum
		 * wraps, so the translated id cannot wrap either. */
		if (in_id >= host_base && in_id < host_base + range)
			return ns_base + (in_id - host_base);
	}

	// no answer found
	return -1;
}
1524
1525 /*
1526 * for is_privileged_over,
1527 * specify whether we require the calling uid to be root in his
1528 * namespace
1529 */
1530 #define NS_ROOT_REQD true
1531 #define NS_ROOT_OPT false
1532
1533 #define PROCLEN 100
1534
/* Decide whether caller (@pid, @uid) is privileged over @victim's uid.
 * With @req_ns_root == NS_ROOT_OPT, merely having the same uid suffices;
 * otherwise the caller must map to root inside its own user namespace
 * (per /proc/<pid>/uid_map) and @victim must be mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	/* uid_t is unsigned; -1 is the "invalid id" sentinel. */
	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject
	 * (nonzero mapped uid, or -1 for "not mapped") */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}
1580
/* Check whether the "other" permission bits in @fmode satisfy the access
 * requested by open(2)-style @req_mode (shift @fmode first for owner or
 * group checks).  Unknown access modes are rejected. */
static bool perms_include(int fmode, mode_t req_mode)
{
	int access = req_mode & O_ACCMODE;
	mode_t wanted;

	if (access == O_RDONLY)
		wanted = S_IROTH;
	else if (access == O_WRONLY)
		wanted = S_IWOTH;
	else if (access == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
1600
1601
1602 /*
1603 * taskcg is a/b/c
1604 * querycg is /a/b/c/d/e
1605 * we return 'd'
1606 */
/*
 * Return the first path component of @taskcg below @querycg, newly
 * allocated: with taskcg "/a/b/c/d/e" and querycg "/a/b/c", the result is
 * "d".  @taskcg must be strictly longer than @querycg.  NULL on error.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *component, *slash;
	size_t skip;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	/* For the root query just drop taskcg's leading '/'; otherwise
	 * skip past the query prefix and its trailing '/'. */
	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	component = strdup(taskcg + skip);
	if (!component)
		return NULL;

	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';

	return component;
}
1627
/* Remove a single trailing newline from @x in place, if present. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1634
/* Return the cgroup path of @pid in controller @contrl, parsed from
 * /proc/<pid>/cgroup.  The returned string is newly allocated (caller
 * frees) and starts with '/'.  NULL if the controller is not mounted or
 * no matching line exists.
 */
char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	/* Each line is "<n>:<controller>:<cgroup path>"; find the one whose
	 * controller field matches and copy its path. */
	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		/* Retry until strdup succeeds (matches must_* convention). */
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}
1681
1682 /*
1683 * check whether a fuse context may access a cgroup dir or file
1684 *
1685 * If file is not null, it is a cgroup file to check under cg.
1686 * If file is null, then we are checking perms on cg itself.
1687 *
1688 * For files we can check the mode of the list_keys result.
1689 * For cgroups, we must make assumptions based on the files under the
1690 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1691 * yet.
1692 */
1693 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1694 {
1695 struct cgfs_files *k = NULL;
1696 bool ret = false;
1697
1698 k = cgfs_get_key(contrl, cg, file);
1699 if (!k)
1700 return false;
1701
1702 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1703 if (perms_include(k->mode >> 6, mode)) {
1704 ret = true;
1705 goto out;
1706 }
1707 }
1708 if (fc->gid == k->gid) {
1709 if (perms_include(k->mode >> 3, mode)) {
1710 ret = true;
1711 goto out;
1712 }
1713 }
1714 ret = perms_include(k->mode, mode);
1715
1716 out:
1717 free_key(k);
1718 return ret;
1719 }
1720
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" from @cg in place; "/init.scope" itself
 * becomes "/".  Anything else is left untouched. */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg), suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';
	else
		suffix[0] = '\0';
}
1738
1739 /*
1740 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1741 * If pid is in /a, he may act on /a/b, but not on /b.
1742 * if the answer is false and nextcg is not NULL, then *nextcg will point
1743 * to a string containing the next cgroup directory under cg, which must be
1744 * freed by the caller.
1745 */
/* True when @pid's own cgroup (in @contrl) is at or below @cg, i.e. the
 * caller may act on @cg.  On false, if @nextcg is non-NULL it receives the
 * next path component of the caller's cgroup under @cg (caller frees). */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	/* Prefix match: the caller's cgroup must be a prefix of @cg. */
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}
1781
1782 /*
1783 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1784 */
/* True when @pid may see directory @cg in @contrl: the root is always
 * visible, a task in the root cgroup sees everything, and otherwise @cg
 * must be an ancestor or descendant of the task's own cgroup (siblings
 * are hidden). */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* Drop the leading '/' so both paths are compared relative. */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strmcps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
1832
1833 /*
1834 * given /cgroup/freezer/a/b, return "freezer".
1835 * the returned char* should NOT be freed.
1836 */
/* Given a path like "/cgroup/freezer/a/b", return the mounted hierarchy
 * name ("freezer") from the global hierarchies[] table.  The returned
 * pointer aliases the table entry and must NOT be freed.  On failure
 * returns NULL with errno set (EACCES/EINVAL/ENOENT). */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	/* Shortest valid path is "/cgroup/X" (9 chars); the '/' after the
	 * 7-char "/cgroup" prefix must sit at offset 7. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8;
	/* Stack copy so we can cut it at the first '/'.  NOTE(review):
	 * strdupa cannot return NULL on glibc; the check below is
	 * effectively dead but harmless. */
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}
1868
1869 /*
1870 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1871 * Note that the returned value may include files (keynames) etc
1872 */
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 * Returns NULL with errno set (EACCES for a too-short path, EINVAL when
 * no component follows the controller); errno is cleared on success.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* Skip "/cgroup/" and look for the '/' ending the controller name. */
	slash = strstr(path + 8, "/");
	if (!slash) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1889
1890 /*
1891 * split the last path element from the path in @cg.
1892 * @dir is newly allocated and should be freed, @last not
1893 */
1894 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1895 {
1896 char *p;
1897
1898 do {
1899 *dir = strdup(cg);
1900 } while (!*dir);
1901 *last = strrchr(cg, '/');
1902 if (!*last) {
1903 *last = NULL;
1904 return;
1905 }
1906 p = strrchr(*dir, '/');
1907 *p = '\0';
1908 }
1909
1910 /*
1911 * FUSE ops for /cgroup
1912 */
1913
/* FUSE getattr for /cgroup paths.  The root and per-controller dirs are
 * synthesized as root-owned 0755 directories; child cgroup dirs and
 * control files take uid/gid/mode from the backing cgroupfs via
 * cgfs_get_key().  Visibility is limited to the caller's own cgroup
 * subtree (caller_may_see_dir / caller_is_in_ancestor). */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1/path2 = parent dir + final component (cgroup or keyfile). */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Do the visibility checks from the caller's pidns init's view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: try it as a control file of path1. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2023
/* FUSE opendir for /cgroup paths.  After visibility and permission checks,
 * stores a file_info (freed in cg_releasedir) in fi->fh describing what
 * to list; NULL controller+cgroup means the /cgroup top level itself. */
int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}
2074
/* FUSE readdir for a cgroup directory: at the top level lists the mounted
 * controllers; otherwise lists the cgroup's control files plus its child
 * cgroups.  A caller living below the listed cgroup is shown only the
 * next path component toward its own cgroup. */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller lives below this dir: reveal only the next
		 * component of its own cgroup path, nothing else. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2158
2159 void do_release_file_info(struct fuse_file_info *fi)
2160 {
2161 struct file_info *f = (struct file_info *)fi->fh;
2162
2163 if (!f)
2164 return;
2165
2166 fi->fh = 0;
2167
2168 free(f->controller);
2169 f->controller = NULL;
2170 free(f->cgroup);
2171 f->cgroup = NULL;
2172 free(f->file);
2173 f->file = NULL;
2174 free(f->buf);
2175 f->buf = NULL;
2176 free(f);
2177 f = NULL;
2178 }
2179
/* FUSE releasedir for /cgroup: free the dir state from cg_opendir. */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2185
/* FUSE open for a cgroup control file.  Verifies the file exists, that the
 * caller may see its directory and access it with fi->flags, then stores a
 * file_info in fi->fh (freed in cg_release). */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1/path2 = containing cgroup + file name. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Existence check only; the key itself is not kept. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}
2253
/* FUSE access(2) for /cgroup paths.  The top level and controller dirs
 * allow read/execute but never write; real files are checked against the
 * caller's visibility and the key's permission bits. */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* path1/path2 = containing cgroup + final component. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* Unknown entry: permit read-only probes, refuse writes. */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2317
/* FUSE release for a cgroup file: free the file state from cg_open. */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2323
2324 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2325
2326 static bool wait_for_sock(int sock, int timeout)
2327 {
2328 struct epoll_event ev;
2329 int epfd, ret, now, starttime, deltatime, saved_errno;
2330
2331 if ((starttime = time(NULL)) < 0)
2332 return false;
2333
2334 if ((epfd = epoll_create(1)) < 0) {
2335 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2336 return false;
2337 }
2338
2339 ev.events = POLLIN_SET;
2340 ev.data.fd = sock;
2341 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2342 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2343 close(epfd);
2344 return false;
2345 }
2346
2347 again:
2348 if ((now = time(NULL)) < 0) {
2349 close(epfd);
2350 return false;
2351 }
2352
2353 deltatime = (starttime + timeout) - now;
2354 if (deltatime < 0) { // timeout
2355 errno = 0;
2356 close(epfd);
2357 return false;
2358 }
2359 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2360 if (ret < 0 && errno == EINTR)
2361 goto again;
2362 saved_errno = errno;
2363 close(epfd);
2364
2365 if (ret <= 0) {
2366 errno = saved_errno;
2367 return false;
2368 }
2369 return true;
2370 }
2371
/* Receive up to @len bytes from @sockfd into @buf, waiting at most two
 * seconds for the socket to become readable.  Returns the recv() result,
 * or -1 on timeout. */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2378
2379 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2380 {
2381 struct msghdr msg = { 0 };
2382 struct iovec iov;
2383 struct cmsghdr *cmsg;
2384 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2385 char buf[1];
2386 buf[0] = 'p';
2387
2388 if (pingfirst) {
2389 if (msgrecv(sock, buf, 1) != 1) {
2390 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2391 return SEND_CREDS_FAIL;
2392 }
2393 }
2394
2395 msg.msg_control = cmsgbuf;
2396 msg.msg_controllen = sizeof(cmsgbuf);
2397
2398 cmsg = CMSG_FIRSTHDR(&msg);
2399 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2400 cmsg->cmsg_level = SOL_SOCKET;
2401 cmsg->cmsg_type = SCM_CREDENTIALS;
2402 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2403
2404 msg.msg_name = NULL;
2405 msg.msg_namelen = 0;
2406
2407 buf[0] = v;
2408 iov.iov_base = buf;
2409 iov.iov_len = sizeof(buf);
2410 msg.msg_iov = &iov;
2411 msg.msg_iovlen = 1;
2412
2413 if (sendmsg(sock, &msg, 0) < 0) {
2414 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2415 if (errno == 3)
2416 return SEND_CREDS_NOTSK;
2417 return SEND_CREDS_FAIL;
2418 }
2419
2420 return SEND_CREDS_OK;
2421 }
2422
/* Receive one byte into *@v together with the sender's SCM_CREDENTIALS
 * into @cred from @sock.  Enables SO_PASSCRED, sends a one-byte ping so
 * the peer knows we are ready (pairs with send_creds(..., pingfirst)),
 * then waits up to two seconds for the message.  Returns false on any
 * setup, timeout, or receive error.  Note: returns true even when no
 * credential cmsg arrived; @cred then keeps its -1/-1/-1 sentinel. */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	/* Sentinel values in case no SCM_CREDENTIALS cmsg is delivered. */
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* Ping the sender so it knows SO_PASSCRED is in effect. */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* The kernel has already translated the sender's pid into our pid
	 * namespace in this ucred. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2480
/* Arguments passed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	// pipe the child ACKs on once it has started
	int sock;	// socket handed through to the wrapped function
	pid_t tpid;	// target pid whose namespace is involved
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2487
2488 /*
2489 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2490 * with clone(). This simply writes '1' as ACK back to the parent
2491 * before calling the actual wrapped function.
2492 */
2493 static int pid_ns_clone_wrapper(void *arg) {
2494 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2495 char b = '1';
2496
2497 close(args->cpipe[0]);
2498 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2499 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2500 close(args->cpipe[1]);
2501 return args->wrapped(args->sock, args->tpid);
2502 }
2503
2504 /*
2505 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2506 * int value back over the socket. This shifts the pid from the
2507 * sender's pidns into tpid's pidns.
2508 */
2509 static int pid_to_ns(int sock, pid_t tpid)
2510 {
2511 char v = '0';
2512 struct ucred cred;
2513
2514 while (recv_creds(sock, &cred, &v)) {
2515 if (v == '1')
2516 return 0;
2517 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2518 return 1;
2519 }
2520 return 0;
2521 }
2522
2523
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child of do_read_pids() and never returns: it _exit()s
 * with 0 on success and 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Enter the pid namespace of the target process @tpid. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child ACKs that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone() rather than fork(); see comment above. The stack grows
	 * downward, so pass the top of the allocation. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2580
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 */
/*
 * do_read_pids - read the pid-list cgroup file @file (tasks/cgroup.procs)
 * of cgroup @cg, translating each pid into @tpid's pid namespace.
 *
 * The raw file is read in our own namespace via cgfs_get_value(). A child
 * is forked which setns()es into @tpid's pidns (pid_to_ns_wrapper); each
 * pid is sent to it as SCM_CREDENTIALS - the kernel translates the pid en
 * route - and the translated value is read back and appended to *@d.
 *
 * Returns true on success; *@d is newly allocated and owned by the caller.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Read the untranslated pid list in our own namespace. */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send the pid as a credential so the kernel translates it. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* task no longer exists; skip this pid */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Ask the child helper to exit: v == '1' is the stop signal. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2668
/*
 * cg_read - FUSE read handler for files under /cgroup.
 *
 * Pid-list files (tasks, cgroup.procs) are translated into the reader's
 * pid namespace via do_read_pids(); everything else is read straight
 * through cgfs_get_value(). A trailing newline is appended when missing
 * and there is room. Only offset 0 is supported; non-zero offsets return
 * 0 (EOF). Returns the number of bytes copied or a negative errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Existence check only; the key itself is not used further. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Append a newline if the value lacks one and the buffer has room. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2735
/*
 * pid_from_ns - counterpart of pid_to_ns, run inside the writer's pid
 * namespace. Reads raw pids from the parent over @sock and sends each one
 * back as SCM_CREDENTIALS so the kernel translates it for the receiver.
 * A pid of -1 from the parent means "done". If a pid cannot be sent as a
 * credential, our own pid is sent instead with v == '1' so the parent
 * knows to skip that entry. Returns 0 on clean shutdown, 1 on error.
 */
static int pid_from_ns(int sock, pid_t tpid)
{
	pid_t vpid;
	struct ucred cred;
	char v;
	int ret;

	cred.uid = 0;
	cred.gid = 0;
	while (1) {
		if (!wait_for_sock(sock, 2)) {
			lxcfs_error("%s\n", "Timeout reading from parent.");
			return 1;
		}
		if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
			lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
			return 1;
		}
		if (vpid == -1) // done
			break;
		v = '0';
		cred.pid = vpid;
		if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
			/* Could not send that pid as a credential; report the
			 * failure for this entry using our own (valid) pid
			 * with v = '1'. */
			v = '1';
			cred.pid = getpid();
			if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
				return 1;
		}
	}
	return 0;
}
2767
/*
 * pid_from_ns_wrapper - mirror of pid_to_ns_wrapper: setns into @tpid's
 * pid namespace, then clone a child (which is actually born inside that
 * namespace) running pid_from_ns. Runs in a forked child of
 * do_write_pids() and never returns: _exit(0) on success, _exit(1) on
 * any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Enter the pid namespace of the target process @tpid. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child ACKs that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone() rather than fork(); see pid_to_ns_wrapper. The stack
	 * grows downward, so pass the top of the allocation. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2813
/*
 * hostuid_to_ns - given host @uid, return via *@answer the uid to which
 * it maps in @pid's user namespace.
 *
 * Returns false when the uid_map file cannot be opened or no mapping
 * exists; in the latter case *answer is (uid_t)-1.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];
	int ret;

	/* snprintf with a bounds check rather than sprintf, consistent
	 * with the other /proc path builders in this file. */
	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return false;

	if ((f = fopen(line, "r")) == NULL) {
		return false;
	}

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* (uid_t)-1 is the "no mapping" sentinel from convert_id_to_ns. */
	if (*answer == -1)
		return false;
	return true;
}
2835
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left at (uid_t)-1 / (gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	/* snprintf with a bounds check rather than sprintf, consistent
	 * with the other /proc path builders in this file. */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;

	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	/* The "Uid:"/"Gid:" lines list real/effective/saved/fs ids; the
	 * first (real) field is taken. */
	while (fgets(line, 400, f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2874
/*
 * may_move_pid - may the requestor @r (running as uid @r_uid) move the
 * victim task @v into a new cgroup?
 *
 * Permitted when: they are the same task; @r is root on the host; both
 * tasks are owned by the same uid; or @r is root inside a user namespace
 * into which @v's owner uid is mapped.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (r_uid == victim_uid)
		return true;

	/* @r must be root inside its own userns... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ...and the victim's owner must be mapped into that userns. */
	return hostuid_to_ns(victim_uid, r, &mapped);
}
2900
/*
 * do_write_pids - move the pids listed in @buf (the raw FUSE write
 * payload) into cgroup @cg's @file on behalf of task @tpid (uid @tuid).
 *
 * A forked child enters @tpid's pid namespace (pid_from_ns_wrapper); each
 * pid parsed from @buf is sent to it and comes back as SCM_CREDENTIALS
 * with the pid translated into our namespace. After a permission check
 * via may_move_pid() the translated pid is written to the cgroup's pids
 * file. Returns true only if every pid was moved successfully.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* v == '0': cred.pid is the translated pid.
			 * v == '1' means the child could not resolve this
			 * pid, so it is silently skipped. */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	/* A pid of -1 tells the child helper to exit. */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose() flushes; a failed flush means the kernel
		 * rejected the pid writes. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2979
/*
 * cg_write - FUSE write handler for files under /cgroup.
 *
 * Pid-list files (tasks, cgroup.procs) go through do_write_pids() so the
 * written pids are translated from the writer's pid namespace; all other
 * files are set directly via cgfs_set_value(). Only offset 0 is
 * supported. On success returns @size; on error a negative errno.
 */
int cg_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	char *localbuf = NULL;
	struct cgfs_files *k = NULL;
	struct file_info *f = (struct file_info *)fi->fh;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	/* NUL-terminated stack copy of the write payload. */
	localbuf = alloca(size+1);
	localbuf[size] = '\0';
	memcpy(localbuf, buf, size);

	/* NOTE(review): error codes are stored in the size_t @size and
	 * returned through int; the negative value survives the
	 * round-trip on the supported ABIs. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		size = -EINVAL;
		goto out;
	}

	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
		size = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
	else
		r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);

	if (!r)
		size = -EINVAL;

out:
	free_key(k);
	return size;
}
3030
/*
 * cg_chown - FUSE chown handler for /cgroup entries.
 *
 * Resolves the controller and cgroup from @path, looks up the current
 * owner (via the cgroup's "tasks" file for directories), verifies the
 * caller is privileged over that owner, then delegates to
 * cgfs_chown_file(). Returns 0 on success or a negative errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1/path2 = parent dir / final component of the cgroup path. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3096
/*
 * cg_chmod - FUSE chmod handler for /cgroup entries.
 *
 * Mirrors cg_chown(): resolves controller and cgroup from @path, finds
 * the current owner via the "tasks" file for directories, checks the
 * caller's privilege (root in the caller's namespace is sufficient here,
 * NS_ROOT_OPT), then applies the mode via cgfs_chmod_file().
 * Returns 0 on success or a negative errno.
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1/path2 = parent dir / final component of the cgroup path. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3165
/*
 * cg_mkdir - FUSE mkdir handler: create a new cgroup under @path.
 *
 * The caller must be inside an ancestor cgroup of the parent directory
 * and have write access to it; the new cgroup is created owned by the
 * caller's uid/gid. Returns 0 on success or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	/* path1 = parent directory the new cgroup is created in. */
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Perform checks relative to the caller's container init, not the
	 * raw fuse request pid. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* @next is the first path component below the caller's own
		 * cgroup; matching @last means the directory exists. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3219
/*
 * cg_rmdir - FUSE rmdir handler: remove the cgroup at @path.
 *
 * Refuses to remove /cgroup itself, a controller root, or anything at the
 * caller's own cgroup level; the caller needs write access to the parent
 * directory. Returns 0 on success or a negative errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Perform checks relative to the caller's container init, not the
	 * raw fuse request pid. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3280
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3287
/*
 * parse_memstat - extract the "total_*" counters needed by the
 * /proc/meminfo emulation from the text of a cgroup memory.stat file.
 *
 * Matched values are converted from bytes to kB in place; counters whose
 * key does not appear in @memstat are left untouched.
 *
 * The previous version paired startswith() with hand-counted prefix
 * offsets (11, 17, 19, ...) that had to be kept in sync with the key
 * strings, and re-ran strlen() on each constant for every line. The
 * table below derives each length with sizeof(key) - 1 at compile time.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		size_t len;
		unsigned long *val;
	} fields[] = {
		{ "total_cache",         sizeof("total_cache") - 1,         cached },
		{ "total_active_anon",   sizeof("total_active_anon") - 1,   active_anon },
		{ "total_inactive_anon", sizeof("total_inactive_anon") - 1, inactive_anon },
		{ "total_active_file",   sizeof("total_active_file") - 1,   active_file },
		{ "total_inactive_file", sizeof("total_inactive_file") - 1, inactive_file },
		{ "total_unevictable",   sizeof("total_unevictable") - 1,   unevictable },
		{ "total_shmem",         sizeof("total_shmem") - 1,         shmem },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		/* First matching key wins, as in the old else-if chain. */
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (strncmp(memstat, fields[i].key, fields[i].len) != 0)
				continue;
			sscanf(memstat + fields[i].len, "%lu", fields[i].val);
			*fields[i].val /= 1024;
			break;
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3324
/*
 * get_blkio_io_value - scan a blkio cgroup stat file for the line
 * "MAJOR:MINOR IOTYPE" and return its value through @v (0 if the key is
 * not found).
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char *eol;
	char key[32];
	size_t len;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	len = strlen(key);

	*v = 0;

	while (*str) {
		/* strncmp with the precomputed length avoids re-running
		 * strlen(key) for every line, which startswith() did. */
		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
3347
/*
 * read_file - read the whole of @path line by line into the per-open
 * cache d->buf, then copy up to @size bytes into @buf (always from
 * offset 0). d->size is set to the full cached length. Returns the
 * number of bytes copied into @buf, or 0 on any error (open failure or
 * cache overflow).
 */
int read_file(const char *path, char *buf, size_t size, struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		/* snprintf returning >= the buffer size means truncation:
		 * the cache is too small for the file. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3387
3388 /*
3389 * FUSE ops for /proc
3390 */
3391
/*
 * get_memlimit - read the memory limit file @file of cgroup @cgroup and
 * return it as an unsigned long. Returns (unsigned long)-1 when the file
 * cannot be read; callers treat that as "no limit".
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (cgfs_get_value("memory", cgroup, file, &value))
		limit = strtoul(value, NULL, 10);

	free(value);

	return limit;
}
3404
/*
 * get_min_memlimit - walk from @cgroup up to the root and return the
 * smallest value of @file found along the path, since a parent's limit
 * caps every descendant. Levels that cannot be read report -1 and are
 * ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *path = strdupa(cgroup);
	unsigned long best, cur;

	best = get_memlimit(path, file);

	while (strcmp(path, "/") != 0) {
		path = dirname(path);
		cur = get_memlimit(path, file);
		if (cur != -1 && cur < best)
			best = cur;
	}

	return best;
}
3421
/*
 * proc_meminfo_read - FUSE read handler for the emulated /proc/meminfo.
 *
 * Rewrites the host's /proc/meminfo line by line using the memory (and,
 * when swap accounting is enabled, memsw) cgroup limits and usage of the
 * caller's container, so totals and free values reflect the container
 * rather than the host. Falls back to the raw host file when the caller
 * has no memory cgroup. The generated text is cached in d->buf, from
 * which reads at non-zero offsets are served.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offsets are served from the cache built on the first
	 * (offset 0) read. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		/* Convert bytes to kB, matching /proc/meminfo units. */
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	/* Copy the host file, substituting cgroup-derived values for the
	 * fields a container should see; lines without a substitution are
	 * passed through verbatim. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			/* Never report more memory than the host has. */
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts->swap_off == false) {
			/* memsw counts memory+swap, so swap usage is the
			 * difference from plain memory usage. */
			unsigned long swaptotal = memswlimit,
				swapusage = memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3613
3614 /*
3615 * Read the cpuset.cpus for cg
3616 * Return the answer in a newly allocated string which must be freed
3617 */
3618 char *get_cpuset(const char *cg)
3619 {
3620 char *answer;
3621
3622 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3623 return NULL;
3624 return answer;
3625 }
3626
3627 bool cpu_in_cpuset(int cpu, const char *cpuset);
3628
/*
 * cpuline_in_cpuset - if @line is a "processor : N" entry from
 * /proc/cpuinfo, return whether cpu N is a member of @cpuset; any other
 * line yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int nr;

	if (sscanf(line, "processor : %d", &nr) != 1)
		return false;

	return cpu_in_cpuset(nr, cpuset);
}
3637
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
 * depending on `param`. Parameter value is returned throuh `value`.
 * Returns true on success.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;

	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* "%ld" would mis-parse an int64_t on platforms where long is
	 * 32 bits; SCNd64 always matches int64_t. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3663
/*
 * max_cpu_count - the maximum number of CPUs a container in cgroup @cg
 * should see, derived from its CFS quota/period; 0 means no quota is
 * set. A fractional quota is rounded up to a whole CPU and the result is
 * clamped to the host CPU count.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, online;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = quota / period;
	/* Round any fractional remainder up to one extra CPU. */
	if (quota % period)
		count++;

	online = get_nprocs();
	if (count > online)
		count = online;

	return count;
}
3697
3698 /*
3699 * Return the exact number of visible CPUs based on CPU quotas.
3700 * If there is no quota set, zero is returned.
3701 */
/*
 * Return the exact (possibly fractional) number of visible CPUs based
 * on CPU quotas. If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	int nprocs;
	double count;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Cap at the number of CPUs the host actually has online. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3726
3727 /*
3728 * Determine whether CPU views should be used or not.
3729 */
/*
 * Determine whether CPU views should be used: they require both the
 * "cpu" and "cpuacct" cgroup controllers to be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3745
3746 /*
3747 * check whether this is a '^processor" line in /proc/cpuinfo
3748 */
/*
 * Check whether this is a "processor : N" line in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int nr;

	return sscanf(line, "processor : %d", &nr) == 1;
}
3757
/*
 * FUSE read handler for the virtualized /proc/cpuinfo.
 *
 * Builds a cpuinfo view limited to the CPUs in the caller's cpuset
 * (and, when CPU views are enabled, capped at the quota-derived CPU
 * count), renumbering processors from 0. The rendered text is cached
 * in d->buf; non-zero offsets are served straight from that cache.
 * Returns the number of bytes copied into buf, 0 on internal error
 * paths (rv stays 0), or -EINVAL for an offset past the cached size.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Continuation read: serve from the cache built by the offset-0
	 * pass; never rebuild mid-file.
	 */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the requesting container's init pid to find its cgroup. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		/* No cpuset cgroup: fall through to the host's cpuinfo. */
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x has a completely different cpuinfo layout; detect it
		 * from the first line and switch to the s390x path below.
		 */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* NOTE(review): compares only the first 12 bytes, i.e.
		 * "# processors" without the trailing ':' — presumably
		 * intentional to match variants of this s390x header line.
		 */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU limit is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* Print this stanza only if the CPU is in the cpuset;
			 * visible CPUs are renumbered contiguously from 0.
			 */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x per-CPU line: "processor N: ..." — rewrite the
			 * CPU number, keep everything after the colon.
			 */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Any other line belongs to the current stanza: copy it
		 * through verbatim while we are inside a visible CPU block.
		 */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: the header (vendor_id, "# processors") must carry the
	 * adjusted CPU count, so rebuild the buffer with the header first
	 * and the already-rendered per-CPU text appended.
	 */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		/* NOTE(review): busy-retry on malloc failure — assumes
		 * allocation eventually succeeds.
		 */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	/* Publish the cache and copy out as much as this read requested. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3933
/*
 * Return the start time (field 22 of /proc/<pid>/stat, in clock ticks
 * since boot) of the init/reaper process that `pid` maps to.
 * On any failure returns 0 with errno set to EINVAL; on success errno
 * is cleared to 0 so callers can distinguish a genuine 0.
 */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	/* Map the caller's pid to the registered container init pid. */
	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument supression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 */
	ret = fscanf(f, "%*d " /* (1) pid %d */
			"%*s " /* (2) comm %s */
			"%*c " /* (3) state %c */
			"%*d " /* (4) ppid %d */
			"%*d " /* (5) pgrp %d */
			"%*d " /* (6) session %d */
			"%*d " /* (7) tty_nr %d */
			"%*d " /* (8) tpgid %d */
			"%*u " /* (9) flags %u */
			"%*u " /* (10) minflt %lu */
			"%*u " /* (11) cminflt %lu */
			"%*u " /* (12) majflt %lu */
			"%*u " /* (13) cmajflt %lu */
			"%*u " /* (14) utime %lu */
			"%*u " /* (15) stime %lu */
			"%*d " /* (16) cutime %ld */
			"%*d " /* (17) cstime %ld */
			"%*d " /* (18) priority %ld */
			"%*d " /* (19) nice %ld */
			"%*d " /* (20) num_threads %ld */
			"%*d " /* (21) itrealvalue %ld */
			"%" PRIu64, /* (22) starttime %llu */
		&starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	/* Success: clear errno so a 0 start time is distinguishable. */
	errno = 0;
	return starttime;
}
4012
4013 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
4014 {
4015 uint64_t clockticks;
4016 int64_t ticks_per_sec;
4017
4018 clockticks = get_reaper_start_time(pid);
4019 if (clockticks == 0 && errno == EINVAL) {
4020 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
4021 return 0;
4022 }
4023
4024 ticks_per_sec = sysconf(_SC_CLK_TCK);
4025 if (ticks_per_sec < 0 && errno == EINVAL) {
4026 lxcfs_debug(
4027 "%s\n",
4028 "failed to determine number of clock ticks in a second");
4029 return 0;
4030 }
4031
4032 return (clockticks /= ticks_per_sec);
4033 }
4034
4035 static uint64_t get_reaper_age(pid_t pid)
4036 {
4037 uint64_t procstart, uptime, procage;
4038
4039 /* We need to substract the time the process has started since system
4040 * boot minus the time when the system has started to get the actual
4041 * reaper age.
4042 */
4043 procstart = get_reaper_start_time_in_sec(pid);
4044 procage = procstart;
4045 if (procstart > 0) {
4046 int ret;
4047 struct timespec spec;
4048
4049 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4050 if (ret < 0)
4051 return 0;
4052 /* We could make this more precise here by using the tv_nsec
4053 * field in the timespec struct and convert it to milliseconds
4054 * and then create a double for the seconds and milliseconds but
4055 * that seems more work than it is worth.
4056 */
4057 uptime = spec.tv_sec;
4058 procage = uptime - procstart;
4059 }
4060
4061 return procage;
4062 }
4063
4064 /*
4065 * Returns 0 on success.
4066 * It is the caller's responsibility to free `return_usage`, unless this
4067 * function returns an error.
4068 */
4069 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4070 {
4071 int cpucount = get_nprocs_conf();
4072 struct cpuacct_usage *cpu_usage;
4073 int rv = 0, i, j, ret;
4074 int cg_cpu;
4075 uint64_t cg_user, cg_system;
4076 int64_t ticks_per_sec;
4077 char *usage_str = NULL;
4078
4079 ticks_per_sec = sysconf(_SC_CLK_TCK);
4080
4081 if (ticks_per_sec < 0 && errno == EINVAL) {
4082 lxcfs_v(
4083 "%s\n",
4084 "read_cpuacct_usage_all failed to determine number of clock ticks "
4085 "in a second");
4086 return -1;
4087 }
4088
4089 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4090 if (!cpu_usage)
4091 return -ENOMEM;
4092
4093 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
4094 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4095 // read cpuacct.usage_percpu instead
4096 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4097 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
4098 rv = -1;
4099 goto err;
4100 }
4101 lxcfs_v("usage_str: %s\n", usage_str);
4102
4103 // convert cpuacct.usage_percpu into cpuacct.usage_all
4104 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4105
4106 char *data = NULL;
4107 size_t sz = 0, asz = 0;
4108
4109 must_strcat(&data, &sz, &asz, "cpu user system\n");
4110
4111 int i = 0, read_pos = 0, read_cnt=0;
4112 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4113 lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4114 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4115 i++;
4116 read_pos += read_cnt;
4117 }
4118
4119 free(usage_str);
4120 usage_str = data;
4121
4122 lxcfs_v("usage_str: %s\n", usage_str);
4123 }
4124
4125 int read_pos = 0, read_cnt=0;
4126 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4127 lxcfs_error("read_cpuacct_usage_all reading first line from "
4128 "%s/cpuacct.usage_all failed.\n", cg);
4129 rv = -1;
4130 goto err;
4131 }
4132
4133 read_pos += read_cnt;
4134
4135 for (i = 0, j = 0; i < cpucount; i++) {
4136 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4137 &cg_system, &read_cnt);
4138
4139 if (ret == EOF)
4140 break;
4141
4142 if (ret != 3) {
4143 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4144 "failed.\n", cg);
4145 rv = -1;
4146 goto err;
4147 }
4148
4149 read_pos += read_cnt;
4150
4151 /* Convert the time from nanoseconds to USER_HZ */
4152 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4153 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4154 j++;
4155 }
4156
4157 rv = 0;
4158 *return_usage = cpu_usage;
4159 *size = cpucount;
4160
4161 err:
4162 if (usage_str)
4163 free(usage_str);
4164
4165 if (rv != 0) {
4166 free(cpu_usage);
4167 *return_usage = NULL;
4168 }
4169
4170 return rv;
4171 }
4172
4173 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4174 {
4175 int i;
4176 unsigned long sum = 0;
4177
4178 for (i = 0; i < cpu_count; i++) {
4179 if (!newer[i].online)
4180 continue;
4181
4182 /* When cpuset is changed on the fly, the CPUs might get reordered.
4183 * We could either reset all counters, or check that the substractions
4184 * below will return expected results.
4185 */
4186 if (newer[i].user > older[i].user)
4187 diff[i].user = newer[i].user - older[i].user;
4188 else
4189 diff[i].user = 0;
4190
4191 if (newer[i].system > older[i].system)
4192 diff[i].system = newer[i].system - older[i].system;
4193 else
4194 diff[i].system = 0;
4195
4196 if (newer[i].idle > older[i].idle)
4197 diff[i].idle = newer[i].idle - older[i].idle;
4198 else
4199 diff[i].idle = 0;
4200
4201 sum += diff[i].user;
4202 sum += diff[i].system;
4203 sum += diff[i].idle;
4204 }
4205
4206 return sum;
4207 }
4208
4209 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4210 {
4211 unsigned long free_space, to_add;
4212
4213 free_space = threshold - usage->user - usage->system;
4214
4215 if (free_space > usage->idle)
4216 free_space = usage->idle;
4217
4218 to_add = free_space > *surplus ? *surplus : free_space;
4219
4220 *counter += to_add;
4221 usage->idle -= to_add;
4222 *surplus -= to_add;
4223 }
4224
4225 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4226 {
4227 struct cg_proc_stat *first = NULL, *prev, *tmp;
4228
4229 for (prev = NULL; node; ) {
4230 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4231 tmp = node;
4232 lxcfs_debug("Removing stat node for %s\n", node->cg);
4233
4234 if (prev)
4235 prev->next = node->next;
4236 else
4237 first = node->next;
4238
4239 node = node->next;
4240 free_proc_stat_node(tmp);
4241 } else {
4242 if (!first)
4243 first = node;
4244 prev = node;
4245 node = node->next;
4246 }
4247 }
4248
4249 return first;
4250 }
4251
4252 #define PROC_STAT_PRUNE_INTERVAL 10
4253 static void prune_proc_stat_history(void)
4254 {
4255 int i;
4256 time_t now = time(NULL);
4257
4258 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4259 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4260
4261 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4262 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4263 return;
4264 }
4265
4266 if (proc_stat_history[i]->next) {
4267 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4268 proc_stat_history[i]->lastcheck = now;
4269 }
4270
4271 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4272 }
4273 }
4274
4275 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4276 {
4277 struct cg_proc_stat *node;
4278
4279 pthread_rwlock_rdlock(&head->lock);
4280
4281 if (!head->next) {
4282 pthread_rwlock_unlock(&head->lock);
4283 return NULL;
4284 }
4285
4286 node = head->next;
4287
4288 do {
4289 if (strcmp(cg, node->cg) == 0)
4290 goto out;
4291 } while ((node = node->next));
4292
4293 node = NULL;
4294
4295 out:
4296 pthread_rwlock_unlock(&head->lock);
4297 prune_proc_stat_history();
4298 return node;
4299 }
4300
4301 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4302 {
4303 struct cg_proc_stat *node;
4304 int i;
4305
4306 node = malloc(sizeof(struct cg_proc_stat));
4307 if (!node)
4308 goto err;
4309
4310 node->cg = NULL;
4311 node->usage = NULL;
4312 node->view = NULL;
4313
4314 node->cg = malloc(strlen(cg) + 1);
4315 if (!node->cg)
4316 goto err;
4317
4318 strcpy(node->cg, cg);
4319
4320 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4321 if (!node->usage)
4322 goto err;
4323
4324 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4325
4326 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4327 if (!node->view)
4328 goto err;
4329
4330 node->cpu_count = cpu_count;
4331 node->next = NULL;
4332
4333 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4334 lxcfs_error("%s\n", "Failed to initialize node lock");
4335 goto err;
4336 }
4337
4338 for (i = 0; i < cpu_count; i++) {
4339 node->view[i].user = 0;
4340 node->view[i].system = 0;
4341 node->view[i].idle = 0;
4342 }
4343
4344 return node;
4345
4346 err:
4347 if (node && node->cg)
4348 free(node->cg);
4349 if (node && node->usage)
4350 free(node->usage);
4351 if (node && node->view)
4352 free(node->view);
4353 if (node)
4354 free(node);
4355
4356 return NULL;
4357 }
4358
4359 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4360 {
4361 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4362 struct cg_proc_stat_head *head = proc_stat_history[hash];
4363 struct cg_proc_stat *node, *rv = new_node;
4364
4365 pthread_rwlock_wrlock(&head->lock);
4366
4367 if (!head->next) {
4368 head->next = new_node;
4369 goto out;
4370 }
4371
4372 node = head->next;
4373
4374 for (;;) {
4375 if (strcmp(node->cg, new_node->cg) == 0) {
4376 /* The node is already present, return it */
4377 free_proc_stat_node(new_node);
4378 rv = node;
4379 goto out;
4380 }
4381
4382 if (node->next) {
4383 node = node->next;
4384 continue;
4385 }
4386
4387 node->next = new_node;
4388 goto out;
4389 }
4390
4391 out:
4392 pthread_rwlock_unlock(&head->lock);
4393 return rv;
4394 }
4395
4396 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4397 {
4398 struct cpuacct_usage *new_usage, *new_view;
4399 int i;
4400
4401 /* Allocate new memory */
4402 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4403 if (!new_usage)
4404 return false;
4405
4406 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4407 if (!new_view) {
4408 free(new_usage);
4409 return false;
4410 }
4411
4412 /* Copy existing data & initialize new elements */
4413 for (i = 0; i < cpu_count; i++) {
4414 if (i < node->cpu_count) {
4415 new_usage[i].user = node->usage[i].user;
4416 new_usage[i].system = node->usage[i].system;
4417 new_usage[i].idle = node->usage[i].idle;
4418
4419 new_view[i].user = node->view[i].user;
4420 new_view[i].system = node->view[i].system;
4421 new_view[i].idle = node->view[i].idle;
4422 } else {
4423 new_usage[i].user = 0;
4424 new_usage[i].system = 0;
4425 new_usage[i].idle = 0;
4426
4427 new_view[i].user = 0;
4428 new_view[i].system = 0;
4429 new_view[i].idle = 0;
4430 }
4431 }
4432
4433 free(node->usage);
4434 free(node->view);
4435
4436 node->usage = new_usage;
4437 node->view = new_view;
4438 node->cpu_count = cpu_count;
4439
4440 return true;
4441 }
4442
4443 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4444 {
4445 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4446 struct cg_proc_stat_head *head = proc_stat_history[hash];
4447 struct cg_proc_stat *node;
4448
4449 node = find_proc_stat_node(head, cg);
4450
4451 if (!node) {
4452 node = new_proc_stat_node(usage, cpu_count, cg);
4453 if (!node)
4454 return NULL;
4455
4456 node = add_proc_stat_node(node);
4457 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4458 }
4459
4460 pthread_mutex_lock(&node->lock);
4461
4462 /* If additional CPUs on the host have been enabled, CPU usage counter
4463 * arrays have to be expanded */
4464 if (node->cpu_count < cpu_count) {
4465 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4466 node->cpu_count, cpu_count, cg);
4467
4468 if (!expand_proc_stat_node(node, cpu_count)) {
4469 pthread_mutex_unlock(&node->lock);
4470 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4471 node->cpu_count, cpu_count, cg);
4472 return NULL;
4473 }
4474 }
4475
4476 return node;
4477 }
4478
4479 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4480 {
4481 int i;
4482
4483 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4484 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4485
4486 for (i = 0; i < cpu_count; i++) {
4487 node->view[i].user = 0;
4488 node->view[i].system = 0;
4489 node->view[i].idle = 0;
4490 }
4491
4492 node->cpu_count = cpu_count;
4493 }
4494
/*
 * Render a quota-aware /proc/stat view for cgroup `cg` into `buf`.
 *
 * Reads the per-CPU lines from the host's /proc/stat (stream `f`,
 * already past its first line), combines them with the cgroup's
 * cpuacct counters in `cg_cpu_usage`, persists deltas in the global
 * stat-node history, and writes an aggregate "cpu" line plus one
 * "cpuN" line per visible CPU (renumbered from 0, capped at the
 * quota-derived max_cpus). The remainder of /proc/stat is passed
 * through verbatim. Returns the number of bytes written, 0 on error.
 *
 * NOTE(review): `rv` and `l` are size_t, so the `if (l < 0)` checks
 * below can never fire; a negative snprintf return wraps to a huge
 * value and is instead caught by the `l >= buf_size` truncation
 * check. Also, the function signature is int but rv is size_t —
 * values above INT_MAX would be misreported (unlikely given buffer
 * sizes, but worth confirming).
 */
static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
{
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	struct cpuacct_usage *diff = NULL;
	int nprocs = get_nprocs_conf();

	/* Never index cg_cpu_usage past what the caller allocated. */
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N */
			break;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu ++;
		cpu_cnt ++;

		/* CPU outside the cpuset: mark the covered range offline.
		 * NOTE(review): this marks i = curcpu..physcpu inclusive —
		 * confirm the inclusive upper bound is intended, since the
		 * in-cpuset gap case below uses an exclusive bound.
		 */
		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++) {
				cg_cpu_usage[i].online = false;
			}
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10)
			continue;

		/* Idle from the container's perspective: host idle plus
		 * whatever host time the cgroup itself did not consume.
		 */
		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
					"%lu in cpuacct.usage_all; unable to determine idle time\n",
					curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	/* Returned node is LOCKED; unlocked in the err path below. */
	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);

	if (!stat_node) {
		lxcfs_error("unable to find/create stat node for %s\n", cg);
		rv = 0;
		goto err;
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		rv = 0;
		goto err;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 * (Only the first online CPU is compared; the loop breaks after it.)
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	/* Accumulate deltas into the raw counters; time spent on CPUs
	 * beyond the visible limit becomes "surplus" to redistribute.
	 * `i` counts online CPUs only (the visible numbering).
	 */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		/* First pass: pack the surplus from hidden CPUs into the
		 * visible CPUs, each capped at `threshold`.
		 */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(
					&user_surplus,
					&diff[curcpu],
					&diff[curcpu].user,
					threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(
					&system_surplus,
					&diff[curcpu],
					&diff[curcpu].system,
					threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		/* Second pass: fold the adjusted deltas into the rendered
		 * view and total sums; track the CPU with the largest idle
		 * delta for the partial-CPU correction below.
		 */
		unsigned long diff_user = 0;
		unsigned long diff_system = 0;
		unsigned long diff_idle = 0;
		unsigned long max_diff_idle = 0;
		unsigned long max_diff_idle_index = 0;
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		// revise cpu usage view to support partial cpu case
		/* When the quota grants e.g. 1.5 CPUs but 2 are shown, trim
		 * the excess idle time from the sums and from the CPU that
		 * had the most idle, so reported utilization matches the
		 * fractional quota.
		 */
		double exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus){
			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		/* No quota: the view simply mirrors the raw counters. */
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
			user_sum,
			system_sum,
			idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;
	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				i,
				stat_node->view[curcpu].user,
				stat_node->view[curcpu].system,
				stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);

		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	rv = total_len;

err:
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);
	if (line)
		free(line);
	if (diff)
		free(diff);
	return rv;
}
4842
/* Half of BUF_RESERVE_SIZE is reserved at the front of d->buf for the
 * aggregated "cpu " line, which can only be formatted once every per-cpu
 * line has been summed. */
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * FUSE read handler for /proc/stat.
 *
 * Presents a container-local view of /proc/stat: per-cpu lines are
 * filtered to the cpus in the caller's cpuset cgroup and renumbered from
 * 0, and -- when cpuacct accounting is readable -- the user/system/idle
 * fields are replaced by the cgroup's own usage. All other lines are
 * passed through from the host unchanged.
 *
 * Returns the number of bytes copied into @buf, or 0 on error.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all: per-cpu lines are written after the first
	 * CPUALL_MAX_SIZE bytes; the summary line is prepended at the end. */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	/* Continuation read: serve the remainder of the cached result. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	/* Map the calling pid to the container's init; fall back to the
	 * caller itself when no reaper is registered. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	/* skip first line: the host's aggregated "cpu " line; the
	 * container-local aggregate is synthesized below. */
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* cpuview mode maintains its own per-container accounting view. */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide cpus outside the container's cpuset; the remaining
		 * cpus are renumbered consecutively from 0 (curcpu). */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* If the host line could not be fully parsed, or there is no
		 * cpuacct data, emit the host's numbers under the renumbered
		 * cpu label. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			/* cpuacct data covers cpus [0, cg_cpu_usage_size);
			 * beyond that there is no usage information. */
			if (physcpu >= cg_cpu_usage_size)
				break;

			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			/* Idle from the container's point of view: host idle
			 * plus whatever the host spent on tasks outside this
			 * cgroup. */
			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			/* NOTE(review): the cpuacct fields are uint64_t printed
			 * with %lu -- fine on LP64, would truncate on 32-bit. */
			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
				 new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	/* Format the aggregated "cpu " line into the reserved prefix. */
	cache = d->buf;

	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Close the gap between the summary line and the per-cpu lines that
	 * were written after the CPUALL_MAX_SIZE reservation. */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
5087
/*
 * Return the cpu time (in whole seconds) recorded by cpuacct.usage for
 * the cgroup of the reaper of @task, or 0 when it cannot be determined.
 *
 * Caveat: cpuacct.usage only reflects the container's own consumption
 * when the container has been given its own cpuacct cgroup. Otherwise
 * the busy time of unrelated tasks sharing the cgroup is included as
 * well. If someone has a clever solution for this please send a patch!
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *val = NULL;
	unsigned long busy = 0;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cg = get_pid_cgroup(initpid, "cpuacct");
	if (!cg)
		goto out;
	prune_init_slice(cg);

	/* cpuacct.usage is in nanoseconds; scale down to seconds. */
	if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &val))
		busy = strtoul(val, NULL, 10) / 1000000000;

out:
	free(cg);
	free(val);
	return busy;
}
5118
#if RELOADTEST
/* Test hook: drop a marker file so the reload test can observe that this
 * library was (re)loaded. */
void iwashere(void)
{
	int marker_fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker_fd >= 0)
		close(marker_fd);
}
#endif
5129
/*
 * FUSE read handler for /proc/uptime.
 *
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the mtime for the reaper for
 * the calling pid as returned by getreaperage
 */
static int proc_uptime_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	/* seconds the container's cgroup spent on-cpu (see get_reaper_busy()) */
	unsigned long int busytime = get_reaper_busy(fc->pid);
	char *cache = d->buf;
	ssize_t total_len = 0;
	uint64_t idletime, reaperage;

#if RELOADTEST
	iwashere();
#endif

	/* Continuation read: serve from the cached buffer. */
	if (offset){
		if (!d->cached)
			return 0;
		if (offset > d->size)
			return -EINVAL;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	reaperage = get_reaper_age(fc->pid);
	/* To understand why this is done, please read the comment to the
	 * get_reaper_busy() function.
	 */
	idletime = reaperage;
	if (reaperage >= busytime)
		idletime = reaperage - busytime;

	/* Both fields are printed with ".00" fractional seconds -- only
	 * whole-second resolution is available here. */
	total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
	if (total_len < 0 || total_len >= d->buflen){
		lxcfs_error("%s\n", "failed to write to cache");
		return 0;
	}

	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size) total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
5182
/*
 * FUSE read handler for /proc/diskstats.
 *
 * Rebuilds each host diskstats line from the container's blkio cgroup
 * counters (io_serviced, io_merged, io_service_bytes, io_wait_time and
 * io_service_time, all *_recursive variants). Devices with no recorded
 * activity in the cgroup are omitted. Fields the cgroup cannot provide
 * (in-flight IOs, weighted ticks) stay 0.
 *
 * Returns the number of bytes copied into @buf, or 0 on error.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
			*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continuation read: serve from the cached buffer. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the container's blkio cgroup; without one, pass the host
	 * file through unchanged. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	/* Walk the host's device list but substitute per-cgroup counters. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* io_service_bytes is in bytes; diskstats reports 512-byte
		 * sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* blkio time statistics are scaled by 1e6 here (ns -> ms;
		 * diskstats tick fields are in milliseconds). */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		/* Emit the device only if this cgroup saw any IO on it. */
		memset(lbuf, 0, 256);
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5315
5316 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5317 struct fuse_file_info *fi)
5318 {
5319 struct fuse_context *fc = fuse_get_context();
5320 struct file_info *d = (struct file_info *)fi->fh;
5321 char *cg = NULL;
5322 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5323 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5324 ssize_t total_len = 0, rv = 0;
5325 ssize_t l = 0;
5326 char *cache = d->buf;
5327
5328 if (offset) {
5329 if (offset > d->size)
5330 return -EINVAL;
5331 if (!d->cached)
5332 return 0;
5333 int left = d->size - offset;
5334 total_len = left > size ? size: left;
5335 memcpy(buf, cache + offset, total_len);
5336 return total_len;
5337 }
5338
5339 pid_t initpid = lookup_initpid_in_store(fc->pid);
5340 if (initpid <= 0)
5341 initpid = fc->pid;
5342 cg = get_pid_cgroup(initpid, "memory");
5343 if (!cg)
5344 return read_file("/proc/swaps", buf, size, d);
5345 prune_init_slice(cg);
5346
5347 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5348
5349 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5350 goto err;
5351
5352 memusage = strtoul(memusage_str, NULL, 10);
5353
5354 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5355 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5356
5357 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5358 memswusage = strtoul(memswusage_str, NULL, 10);
5359
5360 swap_total = (memswlimit - memlimit) / 1024;
5361 swap_free = (memswusage - memusage) / 1024;
5362 }
5363
5364 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5365
5366 /* When no mem + swap limit is specified or swapaccount=0*/
5367 if (!memswlimit) {
5368 char *line = NULL;
5369 size_t linelen = 0;
5370 FILE *f = fopen("/proc/meminfo", "r");
5371
5372 if (!f)
5373 goto err;
5374
5375 while (getline(&line, &linelen, f) != -1) {
5376 if (startswith(line, "SwapTotal:")) {
5377 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5378 } else if (startswith(line, "SwapFree:")) {
5379 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5380 }
5381 }
5382
5383 free(line);
5384 fclose(f);
5385 }
5386
5387 if (swap_total > 0) {
5388 l = snprintf(d->buf + total_len, d->size - total_len,
5389 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5390 swap_total, swap_free);
5391 total_len += l;
5392 }
5393
5394 if (total_len < 0 || l < 0) {
5395 perror("Error writing to cache");
5396 rv = 0;
5397 goto err;
5398 }
5399
5400 d->cached = 1;
5401 d->size = (int)total_len;
5402
5403 if (total_len > size) total_len = size;
5404 memcpy(buf, d->buf, total_len);
5405 rv = total_len;
5406
5407 err:
5408 free(cg);
5409 free(memswlimit_str);
5410 free(memlimit_str);
5411 free(memusage_str);
5412 free(memswusage_str);
5413 return rv;
5414 }
/*
 * Find the process pid from cgroup path.
 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
 * @pid_buf : put pid to pid_buf.
 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : the depth of cgroup in container.
 * @sum : return the number of pid.
 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Recurses into child cgroup directories up to @depth levels and appends
 * every line of each cgroup.procs file (one pid per line, trailing '\n'
 * kept) to *pid_buf. Returns the new element count of *pid_buf.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0
	 * NOTE(review): allocations in this file busy-loop on failure; this
	 * spins forever under OOM but matches the file's convention. */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() below closes it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	/* First recurse into child cgroups, up to @depth levels deep. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): both checks compare only the first byte, so
		 * any entry starting with '.' is skipped (and the ".." check
		 * is redundant); harmless for cgroup directory names. */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Then collect the pids attached directly to this cgroup. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Grow *pid_buf one slot per line; each entry is a strcpy'd line. */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5501 /*
5502 * calc_load calculates the load according to the following formula:
5503 * load1 = load0 * exp + active * (1 - exp)
5504 *
5505 * @load1: the new loadavg.
5506 * @load0: the former loadavg.
5507 * @active: the total number of running pid at this moment.
5508 * @exp: the fixed-point defined in the beginning.
5509 */
5510 static unsigned long
5511 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5512 {
5513 unsigned long newload;
5514
5515 active = active > 0 ? active * FIXED_1 : 0;
5516 newload = load * exp + active * (FIXED_1 - exp);
5517 if (active >= load)
5518 newload += FIXED_1 - 1;
5519
5520 return newload / FIXED_1;
5521 }
5522
5523 /*
5524 * Return 0 means that container p->cg is closed.
5525 * Return -1 means that error occurred in refresh.
5526 * Positive num equals the total number of pid.
5527 */
5528 static int refresh_load(struct load_node *p, char *path)
5529 {
5530 FILE *f = NULL;
5531 char **idbuf;
5532 char proc_path[256];
5533 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5534 char *line = NULL;
5535 size_t linelen = 0;
5536 int sum, length;
5537 DIR *dp;
5538 struct dirent *file;
5539
5540 do {
5541 idbuf = malloc(sizeof(char *));
5542 } while (!idbuf);
5543 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5544 /* normal exit */
5545 if (sum == 0)
5546 goto out;
5547
5548 for (i = 0; i < sum; i++) {
5549 /*clean up '\n' */
5550 length = strlen(idbuf[i])-1;
5551 idbuf[i][length] = '\0';
5552 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5553 if (ret < 0 || ret > 255) {
5554 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5555 i = sum;
5556 sum = -1;
5557 goto err_out;
5558 }
5559
5560 dp = opendir(proc_path);
5561 if (!dp) {
5562 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5563 continue;
5564 }
5565 while ((file = readdir(dp)) != NULL) {
5566 if (strncmp(file->d_name, ".", 1) == 0)
5567 continue;
5568 if (strncmp(file->d_name, "..", 1) == 0)
5569 continue;
5570 total_pid++;
5571 /* We make the biggest pid become last_pid.*/
5572 ret = atof(file->d_name);
5573 last_pid = (ret > last_pid) ? ret : last_pid;
5574
5575 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5576 if (ret < 0 || ret > 255) {
5577 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5578 i = sum;
5579 sum = -1;
5580 closedir(dp);
5581 goto err_out;
5582 }
5583 f = fopen(proc_path, "r");
5584 if (f != NULL) {
5585 while (getline(&line, &linelen, f) != -1) {
5586 /* Find State */
5587 if ((line[0] == 'S') && (line[1] == 't'))
5588 break;
5589 }
5590 if ((line[7] == 'R') || (line[7] == 'D'))
5591 run_pid++;
5592 fclose(f);
5593 }
5594 }
5595 closedir(dp);
5596 }
5597 /*Calculate the loadavg.*/
5598 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5599 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5600 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5601 p->run_pid = run_pid;
5602 p->total_pid = total_pid;
5603 p->last_pid = last_pid;
5604
5605 free(line);
5606 err_out:
5607 for (; i > 0; i--)
5608 free(idbuf[i-1]);
5609 out:
5610 free(idbuf);
5611 return sum;
5612 }
5613 /*
5614 * Traverse the hash table and update it.
5615 */
5616 void *load_begin(void *arg)
5617 {
5618
5619 char *path = NULL;
5620 int i, sum, length, ret;
5621 struct load_node *f;
5622 int first_node;
5623 clock_t time1, time2;
5624
5625 while (1) {
5626 if (loadavg_stop == 1)
5627 return NULL;
5628
5629 time1 = clock();
5630 for (i = 0; i < LOAD_SIZE; i++) {
5631 pthread_mutex_lock(&load_hash[i].lock);
5632 if (load_hash[i].next == NULL) {
5633 pthread_mutex_unlock(&load_hash[i].lock);
5634 continue;
5635 }
5636 f = load_hash[i].next;
5637 first_node = 1;
5638 while (f) {
5639 length = strlen(f->cg) + 2;
5640 do {
5641 /* strlen(f->cg) + '.' or '' + \0 */
5642 path = malloc(length);
5643 } while (!path);
5644
5645 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5646 if (ret < 0 || ret > length - 1) {
5647 /* snprintf failed, ignore the node.*/
5648 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5649 goto out;
5650 }
5651 sum = refresh_load(f, path);
5652 if (sum == 0) {
5653 f = del_node(f, i);
5654 } else {
5655 out: f = f->next;
5656 }
5657 free(path);
5658 /* load_hash[i].lock locks only on the first node.*/
5659 if (first_node == 1) {
5660 first_node = 0;
5661 pthread_mutex_unlock(&load_hash[i].lock);
5662 }
5663 }
5664 }
5665
5666 if (loadavg_stop == 1)
5667 return NULL;
5668
5669 time2 = clock();
5670 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5671 }
5672 }
5673
/*
 * FUSE read handler for /proc/loadavg.
 *
 * When the loadavg daemon is enabled, report the per-container load
 * averages maintained in the load hash table (creating the container's
 * node on first access); otherwise pass the host's file through.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation read: serve from the cached buffer. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	/* loadavg is only set by load_daemon(); without the daemon running
	 * we cannot compute per-container values. */
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* NOTE(review): locate_node() appears to return with
	 * load_hash[hash].rdlock held; it is released below or on the error
	 * path -- confirm against locate_node()'s definition. */
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* Adding FIXED_1/200 rounds to the nearest hundredth when the value
	 * is split by LOAD_INT/LOAD_FRAC, like the kernel's /proc/loadavg. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	/* NOTE(review): total_len is size_t, so the < 0 branch can never
	 * fire; an snprintf error would wrap to a huge value and be caught
	 * by the >= d->buflen test instead. */
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5766 /* Return a positive number on success, return 0 on failure.*/
5767 pthread_t load_daemon(int load_use)
5768 {
5769 int ret;
5770 pthread_t pid;
5771
5772 ret = init_load();
5773 if (ret == -1) {
5774 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5775 return 0;
5776 }
5777 ret = pthread_create(&pid, NULL, load_begin, NULL);
5778 if (ret != 0) {
5779 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5780 load_free();
5781 return 0;
5782 }
5783 /* use loadavg, here loadavg = 1*/
5784 loadavg = load_use;
5785 return pid;
5786 }
5787
5788 /* Returns 0 on success. */
5789 int stop_load_daemon(pthread_t pid)
5790 {
5791 int s;
5792
5793 /* Signal the thread to gracefully stop */
5794 loadavg_stop = 1;
5795
5796 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5797 if (s != 0) {
5798 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5799 return -1;
5800 }
5801
5802 load_free();
5803 loadavg_stop = 0;
5804
5805 return 0;
5806 }
5807
/* Return the total size in bytes of the file at @which, measured by
 * reading it line by line (procfs files report st_size == 0, so stat()
 * is useless here). Returns 0 when the file cannot be opened. */
static off_t get_procfile_size(const char *which)
{
	char *lnbuf = NULL;
	size_t lncap = 0;
	ssize_t nread;
	off_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	while ((nread = getline(&lnbuf, &lncap, fp)) != -1)
		total += nread;

	fclose(fp);
	free(lnbuf);

	return total;
}
5824
/*
 * FUSE getattr handler for the virtualized /proc tree.
 *
 * /proc itself is a read-only directory; the seven emulated files are
 * read-only regular files whose size is reported as 0 (their content is
 * generated on read). All timestamps are set to "now". Returns 0 on
 * success, -EINVAL if the clock cannot be read, -ENOENT for unknown
 * paths.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	bool is_virtual_file;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	is_virtual_file = strcmp(path, "/proc/meminfo") == 0 ||
			  strcmp(path, "/proc/cpuinfo") == 0 ||
			  strcmp(path, "/proc/uptime") == 0 ||
			  strcmp(path, "/proc/stat") == 0 ||
			  strcmp(path, "/proc/diskstats") == 0 ||
			  strcmp(path, "/proc/swaps") == 0 ||
			  strcmp(path, "/proc/loadavg") == 0;
	if (is_virtual_file) {
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
5854
5855 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5856 struct fuse_file_info *fi)
5857 {
5858 if (filler(buf, ".", NULL, 0) != 0 ||
5859 filler(buf, "..", NULL, 0) != 0 ||
5860 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5861 filler(buf, "meminfo", NULL, 0) != 0 ||
5862 filler(buf, "stat", NULL, 0) != 0 ||
5863 filler(buf, "uptime", NULL, 0) != 0 ||
5864 filler(buf, "diskstats", NULL, 0) != 0 ||
5865 filler(buf, "swaps", NULL, 0) != 0 ||
5866 filler(buf, "loadavg", NULL, 0) != 0)
5867 return -EINVAL;
5868 return 0;
5869 }
5870
5871 int proc_open(const char *path, struct fuse_file_info *fi)
5872 {
5873 int type = -1;
5874 struct file_info *info;
5875
5876 if (strcmp(path, "/proc/meminfo") == 0)
5877 type = LXC_TYPE_PROC_MEMINFO;
5878 else if (strcmp(path, "/proc/cpuinfo") == 0)
5879 type = LXC_TYPE_PROC_CPUINFO;
5880 else if (strcmp(path, "/proc/uptime") == 0)
5881 type = LXC_TYPE_PROC_UPTIME;
5882 else if (strcmp(path, "/proc/stat") == 0)
5883 type = LXC_TYPE_PROC_STAT;
5884 else if (strcmp(path, "/proc/diskstats") == 0)
5885 type = LXC_TYPE_PROC_DISKSTATS;
5886 else if (strcmp(path, "/proc/swaps") == 0)
5887 type = LXC_TYPE_PROC_SWAPS;
5888 else if (strcmp(path, "/proc/loadavg") == 0)
5889 type = LXC_TYPE_PROC_LOADAVG;
5890 if (type == -1)
5891 return -ENOENT;
5892
5893 info = malloc(sizeof(*info));
5894 if (!info)
5895 return -ENOMEM;
5896
5897 memset(info, 0, sizeof(*info));
5898 info->type = type;
5899
5900 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5901 do {
5902 info->buf = malloc(info->buflen);
5903 } while (!info->buf);
5904 memset(info->buf, 0, info->buflen);
5905 /* set actual size to buffer size */
5906 info->size = info->buflen;
5907
5908 fi->fh = (unsigned long)info;
5909 return 0;
5910 }
5911
/* FUSE access handler for /proc: the directory mirrors the host's
 * readability, and every emulated file is read-only, so any write or
 * execute request is refused with -EACCES. */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if (mask & ~R_OK)
		return -EACCES;

	return 0;
}
5922
/* FUSE release handler for the emulated /proc files: frees the per-open
 * file_info allocated in proc_open(). Always succeeds. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5928
5929 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5930 struct fuse_file_info *fi)
5931 {
5932 struct file_info *f = (struct file_info *) fi->fh;
5933
5934 switch (f->type) {
5935 case LXC_TYPE_PROC_MEMINFO:
5936 return proc_meminfo_read(buf, size, offset, fi);
5937 case LXC_TYPE_PROC_CPUINFO:
5938 return proc_cpuinfo_read(buf, size, offset, fi);
5939 case LXC_TYPE_PROC_UPTIME:
5940 return proc_uptime_read(buf, size, offset, fi);
5941 case LXC_TYPE_PROC_STAT:
5942 return proc_stat_read(buf, size, offset, fi);
5943 case LXC_TYPE_PROC_DISKSTATS:
5944 return proc_diskstats_read(buf, size, offset, fi);
5945 case LXC_TYPE_PROC_SWAPS:
5946 return proc_swaps_read(buf, size, offset, fi);
5947 case LXC_TYPE_PROC_LOADAVG:
5948 return proc_loadavg_read(buf, size, offset, fi);
5949 default:
5950 return -EINVAL;
5951 }
5952 }
5953
5954 /*
5955 * Functions needed to setup cgroups in the __constructor__.
5956 */
5957
/* Create @dir and any missing ancestors with @mode, like `mkdir -p`.
 * Each pass extends the created prefix by one path component; existing
 * directories (EEXIST) are fine. Returns false on allocation failure or
 * any other mkdir error. */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *comp_start = dir;
	const char *comp_end = dir;
	char *prefix;

	do {
		/* Skip separator(s), then scan to the end of the next
		 * component. */
		comp_start = comp_end + strspn(comp_end, "/");
		comp_end = comp_start + strcspn(comp_start, "/");

		/* Duplicate the prefix of @dir up to the current component's
		 * start; on the final pass this is the full path. */
		prefix = strndup(dir, comp_start - dir);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}

		free(prefix);
	} while (comp_end != comp_start);

	return true;
}
5981
/* Lazily detach whatever is mounted on BASEDIR. EINVAL from umount2()
 * means "not a mount point" and is treated as success; any other failure
 * is reported and returns false. */
static bool umount_if_mounted(void)
{
	if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
		lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
		return false;
	}
	return true;
}
5990
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the statfs result @fs reports the filesystem magic
 * @magic_val; the constant is cast to the platform-dependent f_type
 * type before comparing. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	bool matches;

	matches = fs->f_type == (fs_type_magic)magic_val;
	return matches;
}
5997
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	FILE *mi;
	char *entry = NULL;
	size_t cap = 0;
	bool found = false;

	mi = fopen("/proc/self/mountinfo", "r");
	if (!mi)
		return false;

	while (!found && getline(&entry, &cap, mi) != -1) {
		char *field, *mntpoint_end;
		int skipped;

		/* Advance to the 5th space-separated field (mount point). */
		field = entry;
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		mntpoint_end = strchr(field + 1, ' ');
		if (!mntpoint_end)
			continue;
		*mntpoint_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Does its optional-fields section say rootfs? */
		field = strchr(mntpoint_end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(entry);
	fclose(mi);
	return found;
}
6040
6041 static int pivot_enter()
6042 {
6043 int ret = -1, oldroot = -1, newroot = -1;
6044
6045 oldroot = open("/", O_DIRECTORY | O_RDONLY);
6046 if (oldroot < 0) {
6047 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
6048 return ret;
6049 }
6050
6051 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
6052 if (newroot < 0) {
6053 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
6054 goto err;
6055 }
6056
6057 /* change into new root fs */
6058 if (fchdir(newroot) < 0) {
6059 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6060 goto err;
6061 }
6062
6063 /* pivot_root into our new root fs */
6064 if (pivot_root(".", ".") < 0) {
6065 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
6066 goto err;
6067 }
6068
6069 /*
6070 * At this point the old-root is mounted on top of our new-root.
6071 * To unmounted it we must not be chdir'd into it, so escape back
6072 * to the old-root.
6073 */
6074 if (fchdir(oldroot) < 0) {
6075 lxcfs_error("%s\n", "Failed to enter old root.");
6076 goto err;
6077 }
6078
6079 if (umount2(".", MNT_DETACH) < 0) {
6080 lxcfs_error("%s\n", "Failed to detach old root.");
6081 goto err;
6082 }
6083
6084 if (fchdir(newroot) < 0) {
6085 lxcfs_error("%s\n", "Failed to re-enter new root.");
6086 goto err;
6087 }
6088
6089 ret = 0;
6090
6091 err:
6092 if (oldroot > 0)
6093 close(oldroot);
6094 if (newroot > 0)
6095 close(newroot);
6096
6097 return ret;
6098 }
6099
6100 static int chroot_enter()
6101 {
6102 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6103 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6104 return -1;
6105 }
6106
6107 if (chroot(".") < 0) {
6108 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6109 return -1;
6110 }
6111
6112 if (chdir("/") < 0) {
6113 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6114 return -1;
6115 }
6116
6117 return 0;
6118 }
6119
6120 static int permute_and_enter(void)
6121 {
6122 struct statfs sb;
6123
6124 if (statfs("/", &sb) < 0) {
6125 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6126 return -1;
6127 }
6128
6129 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6130 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6131 * /proc/1/mountinfo. */
6132 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6133 return chroot_enter();
6134
6135 if (pivot_enter() < 0) {
6136 lxcfs_error("%s\n", "Could not perform pivot root.");
6137 return -1;
6138 }
6139
6140 return 0;
6141 }
6142
6143 /* Prepare our new clean root. */
6144 static int permute_prepare(void)
6145 {
6146 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6147 lxcfs_error("%s\n", "Failed to create directory for new root.");
6148 return -1;
6149 }
6150
6151 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6152 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6153 return -1;
6154 }
6155
6156 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6157 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6158 return -1;
6159 }
6160
6161 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6162 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6163 return -1;
6164 }
6165
6166 return 0;
6167 }
6168
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare the new root, then pivot (or chroot) into it; both helpers
	 * return 0 on success. */
	return permute_prepare() == 0 && permute_and_enter() == 0;
}
6182
6183 static int preserve_mnt_ns(int pid)
6184 {
6185 int ret;
6186 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6187 char path[len];
6188
6189 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6190 if (ret < 0 || (size_t)ret >= len)
6191 return -1;
6192
6193 return open(path, O_RDONLY | O_CLOEXEC);
6194 }
6195
/* Prepare a private mount namespace holding a tmpfs at BASEDIR, under which
 * the individual cgroup hierarchies will later be mounted. The new namespace
 * is preserved in the global cgroup_mount_ns_fd so lxcfs can setns() back
 * into it. Returns true on success. The order of steps below is significant:
 * unshare must precede preserving the namespace fd and the remount-private. */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Get rid of any stale mount left by a previous instance. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Everything from here on happens in a fresh mount namespace, leaving
	 * the host's mount table untouched. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep an fd to the freshly unshared namespace for later setns(). */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Prevent our mounts from propagating back to the host. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6231
6232 static bool cgfs_mount_hierarchies(void)
6233 {
6234 char *target;
6235 size_t clen, len;
6236 int i, ret;
6237
6238 for (i = 0; i < num_hierarchies; i++) {
6239 char *controller = hierarchies[i];
6240
6241 clen = strlen(controller);
6242 len = strlen(BASEDIR) + clen + 2;
6243 target = malloc(len);
6244 if (!target)
6245 return false;
6246
6247 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6248 if (ret < 0 || ret >= len) {
6249 free(target);
6250 return false;
6251 }
6252 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6253 free(target);
6254 return false;
6255 }
6256 if (!strcmp(controller, "unified"))
6257 ret = mount("none", target, "cgroup2", 0, NULL);
6258 else
6259 ret = mount(controller, target, "cgroup", 0, controller);
6260 if (ret < 0) {
6261 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6262 free(target);
6263 return false;
6264 }
6265
6266 fd_hierarchies[i] = open(target, O_DIRECTORY);
6267 if (fd_hierarchies[i] < 0) {
6268 free(target);
6269 return false;
6270 }
6271 free(target);
6272 }
6273 return true;
6274 }
6275
/* Build lxcfs' private cgroup view: prepare the tmpfs staging area, mount
 * every hierarchy into it, then make it our root. Returns true on success. */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6291
/* Library constructor: parse /proc/self/cgroup to discover the mounted
 * cgroup hierarchies, then set up lxcfs' private cgroup mounts in a separate
 * mount namespace before switching back to the initial one. On any failure
 * it logs and returns, leaving lxcfs without cgroup support. */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		/* Each line is "<idx>:<controllers>:<path>"; split out the
		 * index and the controller list in place. */
		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One directory fd per hierarchy, filled in by
	 * cgfs_mount_hierarchies(); -1 marks "not opened". */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd: cgfs_setup_controllers() pivots the root, which
	 * implicitly changes the working directory. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Return to the mount namespace we started in; the private cgroup
	 * namespace stays reachable via cgroup_mount_ns_fd. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6382
6383 static void __attribute__((destructor)) free_subsystems(void)
6384 {
6385 int i;
6386
6387 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6388
6389 for (i = 0; i < num_hierarchies; i++) {
6390 if (hierarchies[i])
6391 free(hierarchies[i]);
6392 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6393 close(fd_hierarchies[i]);
6394 }
6395 free(hierarchies);
6396 free(fd_hierarchies);
6397 free_cpuview();
6398
6399 if (cgroup_mount_ns_fd >= 0)
6400 close(cgroup_mount_ns_fd);
6401 }