]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #303 from yinhongbo/master
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "config.h" // for VERSION
42
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Fallback: issue the raw pivot_root(2) syscall directly.
 * Returns 0 on success, -1 with errno set on failure; if the kernel
 * headers do not define __NR_pivot_root we fail with ENOSYS. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
/* libc already provides pivot_root(); just declare the prototype. */
extern int pivot_root(const char * new_root, const char * put_old);
#endif
57
/* One per-CPU accounting sample used by the CPU view code.
 * Units come from the cgroup cpuacct controller — presumably
 * nanoseconds; confirm against the readers elsewhere in this file. */
struct cpuacct_usage {
	uint64_t user;   /* time spent in user mode */
	uint64_t system; /* time spent in kernel mode */
	uint64_t idle;   /* idle time */
	bool online;     /* whether this CPU is currently online */
};
64
/* Parameters of the per-container loadavg hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* seconds between refreshes of the cached values */
#define DEPTH_DIR 3 /* the depth of per cgroup scanned */
/* Fixed-point constants for the kernel-style loadavg calculation. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter enables the virtualized loadavg in proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* Flag telling the background loadavg worker to stop.
 * NOTE(review): sig_atomic_t needs <signal.h>, which is not included in
 * this chunk — presumably it comes in via bindings.h; confirm. */
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * ELF hash of a NUL-terminated string, truncated to 31 bits.
 * Used to pick a bucket in the loadavg hash table.
 */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int overflow;

	for (; *name; name++) {
		hash = (hash << 4) + *name;
		overflow = hash & 0xf0000000;
		if (overflow != 0)
			hash ^= overflow >> 24;
		hash &= ~overflow;
	}

	/* Mask off the sign bit so the result is a non-negative int. */
	return (hash & 0x7fffffff);
}
97
/* One cached loadavg entry, keyed by cgroup path. */
struct load_node {
	char *cg; /* cgroup path this entry describes */
	unsigned long avenrun[3]; /* Load averages (fixed-point, kernel style) */
	unsigned int run_pid;   /* presumably count of running tasks — confirm against refresh code */
	unsigned int total_pid; /* presumably total task count */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in the same hash bucket */
	struct load_node **pre; /* address of the pointer pointing at us, so
				 * unlinking never needs a bucket re-scan */
};
108
/* One bucket of the loadavg hash table plus its three locks. */
struct load_head {
	/*
	 * The lock serializes inserting and refreshing load_nodes. For the
	 * first load_node of each hash bucket, insert and refresh in this
	 * hash bucket are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock serializes reading loadavg against deleting load_nodes.
	 * For each hash bucket, read and delete are mutually exclusive, but
	 * parallel reads are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock serializes reading loadavg against inserting load_nodes.
	 * For the first load_node of each hash bucket, read and insert are
	 * mutually exclusive, but parallel reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of this bucket's chain */
};
130
static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initializes every bucket of the hash table.
 * Return 0 on success, return -1 on failure. On failure every lock
 * initialized so far — including the partially initialized bucket i —
 * is destroyed again before returning.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
/* Fallthrough cleanup: first undo whatever bucket i managed to init... */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
/* ...then tear down all fully initialized buckets below i. */
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
173
/* Link *n in at the head of hash bucket @locate.
 * Holds the bucket mutex plus rilock in write mode, excluding both
 * concurrent refreshes and readers of the bucket head while the head
 * pointer and the back-pointers are rewired. */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	/* Fix up the **pre back-pointers of the new node and the old head. */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() finds special node. Not return NULL means success.
 * It should be noted that rdlock isn't unlocked at the end of code
 * because this function is used to read special node. Delete is not
 * allowed before read has ended.
 * unlock rdlock only in proc_loadavg_read().
 * NOTE(review): the rdlock is held on return in BOTH cases — also when
 * the bucket is empty and NULL is returned — so the caller must always
 * release it; confirm against proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* rilock only protects the bucket head; drop it before walking. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
/* Delete the load_node n and return the next node of it.
 * Takes the bucket's rdlock in write mode so no reader can observe a
 * half-unlinked node; the **pre back-pointer lets us unlink without
 * re-scanning the chain. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
232
/* Tear down the whole loadavg hash table: free every node and destroy
 * all bucket locks. Meant for shutdown; assumes no other thread touches
 * the table afterwards. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free the chain; no pointer fix-up needed since the locks
		 * are destroyed right afterwards. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
265
/* Data for CPU view */
/* One cached entry per cgroup: the raw host cpuacct numbers and the
 * "virtualized" numbers presented to the container. */
struct cg_proc_stat {
	char *cg; /* cgroup path this entry belongs to */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* number of entries in usage/view */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next; /* next entry in the same hash bucket */
};

/* Head of one hash bucket of cg_proc_stat entries. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck; /* set to now at init — presumably updated on prune; confirm */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
285
#define CPUVIEW_HASH_SIZE 100
/* Bucket array for the CPU view cache; slots allocated in init_cpuview(). */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
288
289 static bool cpuview_init_head(struct cg_proc_stat_head **head)
290 {
291 *head = malloc(sizeof(struct cg_proc_stat_head));
292 if (!(*head)) {
293 lxcfs_error("%s\n", strerror(errno));
294 return false;
295 }
296
297 (*head)->lastcheck = time(NULL);
298 (*head)->next = NULL;
299
300 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
301 lxcfs_error("%s\n", "Failed to initialize list lock");
302 free(*head);
303 return false;
304 }
305
306 return true;
307 }
308
309 static bool init_cpuview()
310 {
311 int i;
312
313 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
314 proc_stat_history[i] = NULL;
315
316 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
317 if (!cpuview_init_head(&proc_stat_history[i]))
318 goto err;
319 }
320
321 return true;
322
323 err:
324 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
325 if (proc_stat_history[i]) {
326 free(proc_stat_history[i]);
327 proc_stat_history[i] = NULL;
328 }
329 }
330
331 return false;
332 }
333
/* Destroy one cg_proc_stat entry: tear down its mutex, then free all
 * memory it owns (cgroup name, usage and view arrays, the node itself). */
static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}
342
343 static void cpuview_free_head(struct cg_proc_stat_head *head)
344 {
345 struct cg_proc_stat *node, *tmp;
346
347 if (head->next) {
348 node = head->next;
349
350 for (;;) {
351 tmp = node;
352 node = node->next;
353 free_proc_stat_node(tmp);
354
355 if (!node)
356 break;
357 }
358 }
359
360 pthread_rwlock_destroy(&head->lock);
361 free(head);
362 }
363
364 static void free_cpuview()
365 {
366 int i;
367
368 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
369 if (proc_stat_history[i])
370 cpuview_free_head(proc_stat_history[i]);
371 }
372 }
373
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // last lookup time, used by prune_initpid_store()
};
395
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Cached init pids keyed by pidns inode number; every access must hold
 * pidns_store_mutex (see store_lock()/store_unlock()). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock @l; a failure here indicates corrupted state, so abort. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
411
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* fd of the private cgroup mount namespace; -1 while unset. */
static int cgroup_mount_ns_fd = -1;
432
/* Unlock @l; a failure here indicates corrupted state, so abort. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
442
/* Acquire the global pidns store mutex (aborts the process on failure). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the global pidns store mutex (aborts the process on failure). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
452
/* Must be called under store_lock */
/* Verify that the cached entry @e still refers to the same init process
 * by comparing the recorded creation time of /proc/$initpid with the
 * current one — a mismatch means the pid was recycled. Returns false
 * when /proc/$initpid is gone or its ctime changed.
 * NOTE(review): @nsfdsb is unused here. */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}
470
471 /* Must be called under store_lock */
472 static void remove_initpid(struct pidns_init_store *e)
473 {
474 struct pidns_init_store *tmp;
475 int h;
476
477 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
478
479 h = HASH(e->ino);
480 if (pidns_hash_table[h] == e) {
481 pidns_hash_table[h] = e->next;
482 free(e);
483 return;
484 }
485
486 tmp = pidns_hash_table[h];
487 while (tmp) {
488 if (tmp->next == e) {
489 tmp->next = e->next;
490 free(e);
491 return;
492 }
493 tmp = tmp->next;
494 }
495 }
496
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Drop cached initpid entries that have not been looked up recently.
 * Runs at most once every PURGE_SECS seconds; entries whose lastcheck
 * is older than 2*PURGE_SECS are removed. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	/* First call only records the time, so nothing is purged early. */
	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				/* Unlink before freeing; prev stays put. */
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
539
/* Must be called under store_lock */
/* Insert a cache entry mapping the pidns identified by @sb (the stat of
 * /proc/$qpid/ns/pid) to init pid @pid. Records the ctime of /proc/$pid
 * so later lookups can detect pid reuse. Silently does nothing if
 * /proc/$pid cannot be stat'ed. */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	/* Retry allocation until it succeeds, as elsewhere in this file. */
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	/* Push onto the front of the bucket's chain. */
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}
564
/*
 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 * entry for the inode number and creation time. Verify that the init pid
 * is still valid. If not, remove it. Return the entry if valid, NULL
 * otherwise.
 * Must be called under store_lock
 */
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
	int h = HASH(sb->st_ino);
	struct pidns_init_store *e = pidns_hash_table[h];

	while (e) {
		if (e->ino == sb->st_ino) {
			if (initpid_still_valid(e, sb)) {
				/* Refresh the timestamp so pruning keeps it. */
				e->lastcheck = time(NULL);
				return e;
			}
			/* Stale (pid recycled): drop the entry. */
			remove_initpid(e);
			return NULL;
		}
		e = e->next;
	}

	return NULL;
}
591
592 static int is_dir(const char *path, int fd)
593 {
594 struct stat statbuf;
595 int ret = fstatat(fd, path, &statbuf, fd);
596 if (ret == 0 && S_ISDIR(statbuf.st_mode))
597 return 1;
598 return 0;
599 }
600
601 static char *must_copy_string(const char *str)
602 {
603 char *dup = NULL;
604 if (!str)
605 return NULL;
606 do {
607 dup = strdup(str);
608 } while (!dup);
609
610 return dup;
611 }
612
/* Strip all trailing '\n' characters from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t n = strlen(s);

	while (n > 0 && s[n - 1] == '\n')
		s[--n] = '\0';
}
620
#define BATCH_SIZE 50
/* Ensure *mem can hold @newlen bytes, growing in BATCH_SIZE batches.
 * The retry loop makes allocation effectively infallible, matching the
 * convention used throughout this file. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *tmp;

	if (*mem && want <= have)
		return;

	do {
		tmp = realloc(*mem, want * BATCH_SIZE);
	} while (!tmp);
	*mem = tmp;
}
/* Append @line (of length @linelen, plus its NUL) to *contents,
 * updating *len to the new logical length. */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t total = *len + linelen;

	dorealloc(contents, *len, total + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = total;
}
642
/* Read the entire contents of @fd into a freshly allocated string with
 * trailing newlines stripped. Takes ownership of @fd in all cases: it
 * is closed via fclose(), or via close() when fdopen() fails. Returns
 * NULL on error or empty input; the caller frees the result. @from is
 * unused here. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* Fix: fdopen() failure used to leak @fd — no caller closes
		 * it on this path (see cgfs_get_value()). */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
664
/* Write @string to the stream wrapping @fd. Takes ownership of @fd in
 * all cases: it is closed via fclose() on success and error, and via
 * close() when fdopen() itself fails. @fnam is used only for error
 * messages. Returns true if the whole string was written and flushed. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* Fix: fdopen() failure used to leak @fd — no caller closes
		 * it on this path (see cgfs_set_value()). */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
690
/* Ownership and mode of one cgroup file, as returned by cgfs_get_key(). */
struct cgfs_files {
	char *name; /* basename of the file (heap-allocated) */
	uint32_t uid, gid; /* owner as reported by stat(2) */
	uint32_t mode; /* st_mode bits */
};
696
#define ALLOC_NUM 20
/* Append hierarchy name @h to the global hierarchies array, growing it
 * in ALLOC_NUM-sized batches. Exits the process if realloc fails.
 * Always returns true. NOTE(review): @stridx is unused here. */
static bool store_hierarchy(char *stridx, char *h)
{
	/* Grow only when the current batch is full. */
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}
714
715 static void print_subsystems(void)
716 {
717 int i;
718
719 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
720 fprintf(stderr, "hierarchies:\n");
721 for (i = 0; i < num_hierarchies; i++) {
722 if (hierarchies[i])
723 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
724 fd_hierarchies[i], hierarchies[i]);
725 }
726 }
727
/* Return true if @needle matches one of the comma-separated tokens in
 * @haystack exactly (no partial-token matches). */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	const char *comma;
	size_t want = strlen(needle);

	/* Compare against every comma-terminated token. */
	while (*cur && (comma = strchr(cur, ','))) {
		size_t toklen = comma - cur;

		if (toklen == want && strncmp(needle, cur, want) == 0)
			return true;
		cur = comma + 1;
	}

	/* The last (or only) token has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
746
/* do we need to do any massaging here? I'm not sure... */
/* Return the mounted controller and store the corresponding open file descriptor
 * referring to the controller mountpoint in the private lxcfs namespace in
 * @cfd.
 * A hierarchy entry may be a comma-separated co-mount (e.g. "cpu,cpuacct"),
 * so we accept either an exact match or membership in the comma list.
 * Returns NULL when the controller is not mounted; @cfd is untouched then.
 */
static char *find_mounted_controller(const char *controller, int *cfd)
{
	int i;

	for (i = 0; i < num_hierarchies; i++) {
		if (!hierarchies[i])
			continue;
		if (strcmp(hierarchies[i], controller) == 0) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
		if (in_comma_list(controller, hierarchies[i])) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
	}

	return NULL;
}
771
/* Write @value to @file inside @cgroup under @controller.
 * Returns false when the controller isn't mounted, the path overflows,
 * the file can't be opened, or the write fails. The opened fd is owned
 * (and closed) by write_string(). */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
798
799 // Chown all the files in the cgroup directory. We do this when we create
800 // a cgroup on behalf of a user.
801 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
802 {
803 struct dirent *direntp;
804 char path[MAXPATHLEN];
805 size_t len;
806 DIR *d;
807 int fd1, ret;
808
809 len = strlen(dirname);
810 if (len >= MAXPATHLEN) {
811 lxcfs_error("Pathname too long: %s\n", dirname);
812 return;
813 }
814
815 fd1 = openat(fd, dirname, O_DIRECTORY);
816 if (fd1 < 0)
817 return;
818
819 d = fdopendir(fd1);
820 if (!d) {
821 lxcfs_error("Failed to open %s\n", dirname);
822 return;
823 }
824
825 while ((direntp = readdir(d))) {
826 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
827 continue;
828 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
829 if (ret < 0 || ret >= MAXPATHLEN) {
830 lxcfs_error("Pathname too long under %s\n", dirname);
831 continue;
832 }
833 if (fchownat(fd, path, uid, gid, 0) < 0)
834 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
835 }
836 closedir(d);
837 }
838
/* Create cgroup @cg under @controller, optionally handing ownership to
 * @uid:@gid (including the files inside, cgmanager-style).
 * Returns 0 on success or a negative errno value. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	/* root-owned cgroup: nothing more to do. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	/* Best-effort chown of the files inside the new directory. */
	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}
869
/* Recursively remove directory @dirname. @fd is an open fd for @dirname
 * itself (it is dup'ed because fdopendir() takes ownership of the fd it
 * is given); all paths are resolved relative to the cgroup mount fd
 * @cfd. Returns false if anything along the way failed; removal is
 * best-effort and continues past individual errors. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Only directories need recursion; cgroupfs has no regular
		 * files that survive rmdir of their directory.
		 * NOTE(review): the recursive call reuses @fd, not an fd for
		 * the child dir — confirm this is intentional upstream. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
928
/* Remove cgroup @cg (and everything below it) under @controller.
 * Returns false if the controller isn't mounted, the directory can't be
 * opened, or removal failed anywhere in the tree. */
bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam, *tmpc;
	bool bret;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}
955
956 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
957 {
958 int cfd;
959 size_t len;
960 char *pathname, *tmpc;
961
962 tmpc = find_mounted_controller(controller, &cfd);
963 if (!tmpc)
964 return false;
965
966 /* Make sure we pass a relative path to *at() family of functions.
967 * . + /file + \0
968 */
969 len = strlen(file) + 2;
970 pathname = alloca(len);
971 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
972 if (fchmodat(cfd, pathname, mode, 0) < 0)
973 return false;
974 return true;
975 }
976
/* Chown the "tasks" and "cgroup.procs" files inside cgroup directory
 * @dirname (relative to dirfd @fd) to @uid:@gid.
 * Returns 0 on success or -errno on the first failure.
 * The buffer is sized for "/cgroup.procs", the longer of the two names,
 * so both snprintf() calls fit. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *fname;

	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);
	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	return 0;
}
992
/* Chown @file under @controller to @uid:@gid; when @file is a
 * directory, also chown its tasks and cgroup.procs files (as cgmanager
 * did). Returns 0 on success or a negative errno value. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}
1018
/* Open @cgroup's cgroup.procs file under @controller for writing.
 * Returns a stream the caller must fclose(), or NULL on failure.
 * NOTE(review): if fdopen() fails the fd leaks here, same pattern as
 * elsewhere in this file — confirm and fix together. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1042
1043 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1044 void ***list, size_t typesize,
1045 void* (*iterator)(const char*, const char*, const char*))
1046 {
1047 int cfd, fd, ret;
1048 size_t len;
1049 char *cg, *tmpc;
1050 char pathname[MAXPATHLEN];
1051 size_t sz = 0, asz = 0;
1052 struct dirent *dirent;
1053 DIR *dir;
1054
1055 tmpc = find_mounted_controller(controller, &cfd);
1056 *list = NULL;
1057 if (!tmpc)
1058 return false;
1059
1060 /* Make sure we pass a relative path to *at() family of functions. */
1061 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1062 cg = alloca(len);
1063 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1064 if (ret < 0 || (size_t)ret >= len) {
1065 lxcfs_error("Pathname too long under %s\n", cgroup);
1066 return false;
1067 }
1068
1069 fd = openat(cfd, cg, O_DIRECTORY);
1070 if (fd < 0)
1071 return false;
1072
1073 dir = fdopendir(fd);
1074 if (!dir)
1075 return false;
1076
1077 while ((dirent = readdir(dir))) {
1078 struct stat mystat;
1079
1080 if (!strcmp(dirent->d_name, ".") ||
1081 !strcmp(dirent->d_name, ".."))
1082 continue;
1083
1084 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1085 if (ret < 0 || ret >= MAXPATHLEN) {
1086 lxcfs_error("Pathname too long under %s\n", cg);
1087 continue;
1088 }
1089
1090 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1091 if (ret) {
1092 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1093 continue;
1094 }
1095 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1096 (directories && !S_ISDIR(mystat.st_mode)))
1097 continue;
1098
1099 if (sz+2 >= asz) {
1100 void **tmp;
1101 asz += BATCH_SIZE;
1102 do {
1103 tmp = realloc(*list, asz * typesize);
1104 } while (!tmp);
1105 *list = tmp;
1106 }
1107 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1108 (*list)[sz+1] = NULL;
1109 sz++;
1110 }
1111 if (closedir(dir) < 0) {
1112 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1113 return false;
1114 }
1115 return true;
1116 }
1117
1118 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1119 {
1120 char *dup;
1121 do {
1122 dup = strdup(dir_entry);
1123 } while (!dup);
1124 return dup;
1125 }
1126
/* Collect the names of @cgroup's child cgroups into a NULL-terminated
 * array of strings. Caller frees each entry and the array itself. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1131
/* Free one cgfs_files entry (NULL-safe). */
void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free(k->name);
	free(k);
}
1139
/* Free a NULL-terminated array of cgfs_files entries and the array
 * itself (NULL-safe). */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **it;

	if (!keys)
		return;

	for (it = keys; *it; it++)
		free_key(*it);
	free(keys);
}
1151
/* Read the contents of @file inside @cgroup under @controller into
 * *value (heap-allocated, trailing newlines stripped by slurp_file()).
 * Returns true on success; the opened fd is owned by slurp_file(). */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
1178
/* Check whether @file exists inside @cgroup under @controller.
 * Returns false when the controller isn't mounted, the path overflows,
 * or the file is absent. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Build a relative path for the *at() call:
	 * . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1200
1201 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1202 {
1203 int ret, cfd;
1204 size_t len;
1205 char *fnam, *tmpc;
1206 struct stat sb;
1207 struct cgfs_files *newkey;
1208
1209 tmpc = find_mounted_controller(controller, &cfd);
1210 if (!tmpc)
1211 return false;
1212
1213 if (file && *file == '/')
1214 file++;
1215
1216 if (file && strchr(file, '/'))
1217 return NULL;
1218
1219 /* Make sure we pass a relative path to *at() family of functions.
1220 * . + /cgroup + / + file + \0
1221 */
1222 len = strlen(cgroup) + 3;
1223 if (file)
1224 len += strlen(file) + 1;
1225 fnam = alloca(len);
1226 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1227 file ? "/" : "", file ? file : "");
1228
1229 ret = fstatat(cfd, fnam, &sb, 0);
1230 if (ret < 0)
1231 return NULL;
1232
1233 do {
1234 newkey = malloc(sizeof(struct cgfs_files));
1235 } while (!newkey);
1236 if (file)
1237 newkey->name = must_copy_string(file);
1238 else if (strrchr(cgroup, '/'))
1239 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1240 else
1241 newkey->name = must_copy_string(cgroup);
1242 newkey->uid = sb.st_uid;
1243 newkey->gid = sb.st_gid;
1244 newkey->mode = sb.st_mode;
1245
1246 return newkey;
1247 }
1248
/* cgfs_iterate_cgroup() callback: wrap cgfs_get_key() and log failures. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key;

	key = cgfs_get_key(controller, cgroup, dir_entry);
	if (!key)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return key;
}
1258
/* Collect all keys (files) under @cgroup for @controller into a
 * NULL-terminated array returned via @keys (caller releases with
 * free_keys()). Returns true on success. */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1263
/* Return true iff @f is a directory (i.e. a child cgroup) under @cgroup
 * in @controller's hierarchy. */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() family: . + /cgroup + / + f + \0 */
	len = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1291
/* Result codes for send_creds(). */
#define SEND_CREDS_OK 0    /* credentials sent successfully */
#define SEND_CREDS_NOTSK 1 /* target task is gone (ESRCH from sendmsg) */
#define SEND_CREDS_FAIL 2  /* any other failure */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1299
1300 /*
1301 * clone a task which switches to @task's namespace and writes '1'.
1302 * over a unix sock so we can read the task's reaper's pid in our
1303 * namespace
1304 *
1305 * Note: glibc's fork() does not respect pidns, which can lead to failed
1306 * assertions inside glibc (and thus failed forks) if the child's pid in
1307 * the pidns and the parent pid outside are identical. Using clone prevents
1308 * this issue.
1309 */
1310 static void write_task_init_pid_exit(int sock, pid_t target)
1311 {
1312 char fnam[100];
1313 pid_t pid;
1314 int fd, ret;
1315 size_t stack_size = sysconf(_SC_PAGESIZE);
1316 void *stack = alloca(stack_size);
1317
1318 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1319 if (ret < 0 || ret >= sizeof(fnam))
1320 _exit(1);
1321
1322 fd = open(fnam, O_RDONLY);
1323 if (fd < 0) {
1324 perror("write_task_init_pid_exit open of ns/pid");
1325 _exit(1);
1326 }
1327 if (setns(fd, 0)) {
1328 perror("write_task_init_pid_exit setns 1");
1329 close(fd);
1330 _exit(1);
1331 }
1332 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1333 if (pid < 0)
1334 _exit(1);
1335 if (pid != 0) {
1336 if (!wait_for_pid(pid))
1337 _exit(1);
1338 _exit(0);
1339 }
1340 }
1341
1342 static int send_creds_clone_wrapper(void *arg) {
1343 struct ucred cred;
1344 char v;
1345 int sock = *(int *)arg;
1346
1347 /* we are the child */
1348 cred.uid = 0;
1349 cred.gid = 0;
1350 cred.pid = 1;
1351 v = '1';
1352 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1353 return 1;
1354 return 0;
1355 }
1356
/* Fork a helper that joins @task's pid namespace and reports the pid of
 * its init (reaper) as seen from our namespace. Returns -1 on failure. */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid, ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (pid == 0) {
		/* child: never returns */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (recv_creds(sock[1], &cred, &v))
		ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1390
1391 pid_t lookup_initpid_in_store(pid_t qpid)
1392 {
1393 pid_t answer = 0;
1394 struct stat sb;
1395 struct pidns_init_store *e;
1396 char fnam[100];
1397
1398 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1399 store_lock();
1400 if (stat(fnam, &sb) < 0)
1401 goto out;
1402 e = lookup_verify_initpid(&sb);
1403 if (e) {
1404 answer = e->initpid;
1405 goto out;
1406 }
1407 answer = get_init_pid_for_task(qpid);
1408 if (answer > 0)
1409 save_initpid(&sb, answer);
1410
1411 out:
1412 /* we prune at end in case we are returning
1413 * the value we were about to return */
1414 prune_initpid_store();
1415 store_unlock();
1416 return answer;
1417 }
1418
/* Reap child @pid, retrying on EINTR. Returns 0 iff the child exited
 * normally with status 0, -1 otherwise. */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);

		if (w == pid)
			break;
		if (w == -1 && errno != EINTR)
			return -1;
		/* interrupted or raced: try again */
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1439
1440 /*
1441 * append the given formatted string to *src.
1442 * src: a pointer to a char* in which to append the formatted string.
1443 * sz: the number of characters printed so far, minus trailing \0.
1444 * asz: the allocated size so far
1445 * format: string format. See printf for details.
1446 * ...: varargs. See printf for details.
1447 */
1448 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1449 {
1450 char tmp[BUF_RESERVE_SIZE];
1451 va_list args;
1452
1453 va_start (args, format);
1454 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1455 va_end(args);
1456
1457 if (!*src || tmplen + *sz + 1 >= *asz) {
1458 char *tmp;
1459 do {
1460 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1461 } while (!tmp);
1462 *src = tmp;
1463 *asz += BUF_RESERVE_SIZE;
1464 }
1465 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1466 *sz += tmplen;
1467 }
1468
1469 /*
1470 * append pid to *src.
1471 * src: a pointer to a char* in which ot append the pid.
1472 * sz: the number of characters printed so far, minus trailing \0.
1473 * asz: the allocated size so far
1474 * pid: the pid to append
1475 */
1476 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1477 {
1478 must_strcat(src, sz, asz, "%d\n", (int)pid);
1479 }
1480
1481 /*
1482 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1483 * valid in the caller's namespace, return the id mapped into
1484 * pid's namespace.
1485 * Returns the mapped id, or -1 on error.
1486 */
1487 unsigned int
1488 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1489 {
1490 unsigned int nsuid, // base id for a range in the idfile's namespace
1491 hostuid, // base id for a range in the caller's namespace
1492 count; // number of ids in this range
1493 char line[400];
1494 int ret;
1495
1496 fseek(idfile, 0L, SEEK_SET);
1497 while (fgets(line, 400, idfile)) {
1498 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1499 if (ret != 3)
1500 continue;
1501 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1502 /*
1503 * uids wrapped around - unexpected as this is a procfile,
1504 * so just bail.
1505 */
1506 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1507 nsuid, hostuid, count, line);
1508 return -1;
1509 }
1510 if (hostuid <= in_id && hostuid+count > in_id) {
1511 /*
1512 * now since hostuid <= in_id < hostuid+count, and
1513 * hostuid+count and nsuid+count do not wrap around,
1514 * we know that nsuid+(in_id-hostuid) which must be
1515 * less that nsuid+(count) must not wrap around
1516 */
1517 return (in_id - hostuid) + nsuid;
1518 }
1519 }
1520
1521 // no answer found
1522 return -1;
1523 }
1524
1525 /*
1526 * for is_privileged_over,
1527 * specify whether we require the calling uid to be root in his
1528 * namespace
1529 */
1530 #define NS_ROOT_REQD true
1531 #define NS_ROOT_OPT false
1532
1533 #define PROCLEN 100
1534
1535 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1536 {
1537 char fpath[PROCLEN];
1538 int ret;
1539 bool answer = false;
1540 uid_t nsuid;
1541
1542 if (victim == -1 || uid == -1)
1543 return false;
1544
1545 /*
1546 * If the request is one not requiring root in the namespace,
1547 * then having the same uid suffices. (i.e. uid 1000 has write
1548 * access to files owned by uid 1000
1549 */
1550 if (!req_ns_root && uid == victim)
1551 return true;
1552
1553 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1554 if (ret < 0 || ret >= PROCLEN)
1555 return false;
1556 FILE *f = fopen(fpath, "r");
1557 if (!f)
1558 return false;
1559
1560 /* if caller's not root in his namespace, reject */
1561 nsuid = convert_id_to_ns(f, uid);
1562 if (nsuid)
1563 goto out;
1564
1565 /*
1566 * If victim is not mapped into caller's ns, reject.
1567 * XXX I'm not sure this check is needed given that fuse
1568 * will be sending requests where the vfs has converted
1569 */
1570 nsuid = convert_id_to_ns(f, victim);
1571 if (nsuid == -1)
1572 goto out;
1573
1574 answer = true;
1575
1576 out:
1577 fclose(f);
1578 return answer;
1579 }
1580
/* Check whether open-mode @req_mode (O_RDONLY/O_WRONLY/O_RDWR) is granted
 * by the "other" permission bits passed in @fmode. */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		wanted = S_IROTH;
		break;
	case O_WRONLY:
		wanted = S_IWOTH;
		break;
	case O_RDWR:
		wanted = S_IROTH | S_IWOTH;
		break;
	default:
		/* invalid access mode */
		return false;
	}

	return (fmode & wanted) == wanted;
}
1600
1601
1602 /*
1603 * taskcg is a/b/c
1604 * querycg is /a/b/c/d/e
1605 * we return 'd'
1606 */
1607 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1608 {
1609 char *start, *end;
1610
1611 if (strlen(taskcg) <= strlen(querycg)) {
1612 lxcfs_error("%s\n", "I was fed bad input.");
1613 return NULL;
1614 }
1615
1616 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1617 start = strdup(taskcg + 1);
1618 else
1619 start = strdup(taskcg + strlen(querycg) + 1);
1620 if (!start)
1621 return NULL;
1622 end = strchr(start, '/');
1623 if (end)
1624 *end = '\0';
1625 return start;
1626 }
1627
/* Trim a single trailing newline from @x, in place. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1634
1635 char *get_pid_cgroup(pid_t pid, const char *contrl)
1636 {
1637 int cfd;
1638 char fnam[PROCLEN];
1639 FILE *f;
1640 char *answer = NULL;
1641 char *line = NULL;
1642 size_t len = 0;
1643 int ret;
1644 const char *h = find_mounted_controller(contrl, &cfd);
1645 if (!h)
1646 return NULL;
1647
1648 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1649 if (ret < 0 || ret >= PROCLEN)
1650 return NULL;
1651 if (!(f = fopen(fnam, "r")))
1652 return NULL;
1653
1654 while (getline(&line, &len, f) != -1) {
1655 char *c1, *c2;
1656 if (!line[0])
1657 continue;
1658 c1 = strchr(line, ':');
1659 if (!c1)
1660 goto out;
1661 c1++;
1662 c2 = strchr(c1, ':');
1663 if (!c2)
1664 goto out;
1665 *c2 = '\0';
1666 if (strcmp(c1, h) != 0)
1667 continue;
1668 c2++;
1669 stripnewline(c2);
1670 do {
1671 answer = strdup(c2);
1672 } while (!answer);
1673 break;
1674 }
1675
1676 out:
1677 fclose(f);
1678 free(line);
1679 return answer;
1680 }
1681
1682 /*
1683 * check whether a fuse context may access a cgroup dir or file
1684 *
1685 * If file is not null, it is a cgroup file to check under cg.
1686 * If file is null, then we are checking perms on cg itself.
1687 *
1688 * For files we can check the mode of the list_keys result.
1689 * For cgroups, we must make assumptions based on the files under the
1690 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1691 * yet.
1692 */
1693 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1694 {
1695 struct cgfs_files *k = NULL;
1696 bool ret = false;
1697
1698 k = cgfs_get_key(contrl, cg, file);
1699 if (!k)
1700 return false;
1701
1702 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1703 if (perms_include(k->mode >> 6, mode)) {
1704 ret = true;
1705 goto out;
1706 }
1707 }
1708 if (fc->gid == k->gid) {
1709 if (perms_include(k->mode >> 3, mode)) {
1710 ret = true;
1711 goto out;
1712 }
1713 }
1714 ret = perms_include(k->mode, mode);
1715
1716 out:
1717 free_key(k);
1718 return ret;
1719 }
1720
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" from cgroup path @cg in place; a bare
 * "/init.scope" collapses to "/". */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg), suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0'; /* whole path was "/init.scope" -> "/" */
	else
		suffix[0] = '\0';
}
1738
1739 /*
1740 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1741 * If pid is in /a, he may act on /a/b, but not on /b.
1742 * if the answer is false and nextcg is not NULL, then *nextcg will point
1743 * to a string containing the next cgroup directory under cg, which must be
1744 * freed by the caller.
1745 */
1746 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1747 {
1748 bool answer = false;
1749 char *c2 = get_pid_cgroup(pid, contrl);
1750 char *linecmp;
1751
1752 if (!c2)
1753 return false;
1754 prune_init_slice(c2);
1755
1756 /*
1757 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1758 * they pass in a cgroup without leading '/'
1759 *
1760 * The original line here was:
1761 * linecmp = *cg == '/' ? c2 : c2+1;
1762 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1763 * Serge, do you know?
1764 */
1765 if (*cg == '/' || !strncmp(cg, "./", 2))
1766 linecmp = c2;
1767 else
1768 linecmp = c2 + 1;
1769 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1770 if (nextcg) {
1771 *nextcg = get_next_cgroup_dir(linecmp, cg);
1772 }
1773 goto out;
1774 }
1775 answer = true;
1776
1777 out:
1778 free(c2);
1779 return answer;
1780 }
1781
1782 /*
1783 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1784 */
1785 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1786 {
1787 bool answer = false;
1788 char *c2, *task_cg;
1789 size_t target_len, task_len;
1790
1791 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1792 return true;
1793
1794 c2 = get_pid_cgroup(pid, contrl);
1795 if (!c2)
1796 return false;
1797 prune_init_slice(c2);
1798
1799 task_cg = c2 + 1;
1800 target_len = strlen(cg);
1801 task_len = strlen(task_cg);
1802 if (task_len == 0) {
1803 /* Task is in the root cg, it can see everything. This case is
1804 * not handled by the strmcps below, since they test for the
1805 * last /, but that is the first / that we've chopped off
1806 * above.
1807 */
1808 answer = true;
1809 goto out;
1810 }
1811 if (strcmp(cg, task_cg) == 0) {
1812 answer = true;
1813 goto out;
1814 }
1815 if (target_len < task_len) {
1816 /* looking up a parent dir */
1817 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1818 answer = true;
1819 goto out;
1820 }
1821 if (target_len > task_len) {
1822 /* looking up a child dir */
1823 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1824 answer = true;
1825 goto out;
1826 }
1827
1828 out:
1829 free(c2);
1830 return answer;
1831 }
1832
1833 /*
1834 * given /cgroup/freezer/a/b, return "freezer".
1835 * the returned char* should NOT be freed.
1836 */
1837 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1838 {
1839 const char *p1;
1840 char *contr, *slash;
1841
1842 if (strlen(path) < 9) {
1843 errno = EACCES;
1844 return NULL;
1845 }
1846 if (*(path + 7) != '/') {
1847 errno = EINVAL;
1848 return NULL;
1849 }
1850 p1 = path + 8;
1851 contr = strdupa(p1);
1852 if (!contr) {
1853 errno = ENOMEM;
1854 return NULL;
1855 }
1856 slash = strstr(contr, "/");
1857 if (slash)
1858 *slash = '\0';
1859
1860 int i;
1861 for (i = 0; i < num_hierarchies; i++) {
1862 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1863 return hierarchies[i];
1864 }
1865 errno = ENOENT;
1866 return NULL;
1867 }
1868
1869 /*
1870 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1871 * Note that the returned value may include files (keynames) etc
1872 */
1873 static const char *find_cgroup_in_path(const char *path)
1874 {
1875 const char *p1;
1876
1877 if (strlen(path) < 9) {
1878 errno = EACCES;
1879 return NULL;
1880 }
1881 p1 = strstr(path + 8, "/");
1882 if (!p1) {
1883 errno = EINVAL;
1884 return NULL;
1885 }
1886 errno = 0;
1887 return p1 + 1;
1888 }
1889
1890 /*
1891 * split the last path element from the path in @cg.
1892 * @dir is newly allocated and should be freed, @last not
1893 */
1894 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1895 {
1896 char *p;
1897
1898 do {
1899 *dir = strdup(cg);
1900 } while (!*dir);
1901 *last = strrchr(cg, '/');
1902 if (!*last) {
1903 *last = NULL;
1904 return;
1905 }
1906 p = strrchr(*dir, '/');
1907 *p = '\0';
1908 }
1909
1910 /*
1911 * FUSE ops for /cgroup
1912 */
1913
/* FUSE getattr for /cgroup paths. Synthesizes a stat: dirs for the top
 * level, controllers and child cgroups; regular files for cgroup keys.
 * Visibility and permissions are derived from the caller's own cgroup
 * and the key's owner/mode. Returns 0 or a negative errno. */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* All entries are owned by root and timestamped "now". */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* Split into parent dir (path1) and final element (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Resolve the caller's pidns init so checks use its view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: maybe it is a key (regular file). */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2023
2024 int cg_opendir(const char *path, struct fuse_file_info *fi)
2025 {
2026 struct fuse_context *fc = fuse_get_context();
2027 const char *cgroup;
2028 struct file_info *dir_info;
2029 char *controller = NULL;
2030
2031 if (!fc)
2032 return -EIO;
2033
2034 if (strcmp(path, "/cgroup") == 0) {
2035 cgroup = NULL;
2036 controller = NULL;
2037 } else {
2038 // return list of keys for the controller, and list of child cgroups
2039 controller = pick_controller_from_path(fc, path);
2040 if (!controller)
2041 return -errno;
2042
2043 cgroup = find_cgroup_in_path(path);
2044 if (!cgroup) {
2045 /* this is just /cgroup/controller, return its contents */
2046 cgroup = "/";
2047 }
2048 }
2049
2050 pid_t initpid = lookup_initpid_in_store(fc->pid);
2051 if (initpid <= 0)
2052 initpid = fc->pid;
2053 if (cgroup) {
2054 if (!caller_may_see_dir(initpid, controller, cgroup))
2055 return -ENOENT;
2056 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2057 return -EACCES;
2058 }
2059
2060 /* we'll free this at cg_releasedir */
2061 dir_info = malloc(sizeof(*dir_info));
2062 if (!dir_info)
2063 return -ENOMEM;
2064 dir_info->controller = must_copy_string(controller);
2065 dir_info->cgroup = must_copy_string(cgroup);
2066 dir_info->type = LXC_TYPE_CGDIR;
2067 dir_info->buf = NULL;
2068 dir_info->file = NULL;
2069 dir_info->buflen = 0;
2070
2071 fi->fh = (unsigned long)dir_info;
2072 return 0;
2073 }
2074
/* FUSE readdir for /cgroup paths: emits ".", "..", then either the list of
 * controllers (top level), or the keys and child cgroups of the opened
 * cgroup dir (restricted to what the caller may see). Returns 0 or a
 * negative errno. */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0;  i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	/* Resolve the caller's pidns init so checks use its view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller is above this cgroup: show only the next dir down. */
		if (nextcg) {
			ret = filler(buf, nextcg,  NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Emit the cgroup's keys (files). */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2158
2159 void do_release_file_info(struct fuse_file_info *fi)
2160 {
2161 struct file_info *f = (struct file_info *)fi->fh;
2162
2163 if (!f)
2164 return;
2165
2166 fi->fh = 0;
2167
2168 free(f->controller);
2169 f->controller = NULL;
2170 free(f->cgroup);
2171 f->cgroup = NULL;
2172 free(f->file);
2173 f->file = NULL;
2174 free(f->buf);
2175 f->buf = NULL;
2176 free(f);
2177 f = NULL;
2178 }
2179
/* FUSE releasedir: free the file_info allocated in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2185
2186 int cg_open(const char *path, struct fuse_file_info *fi)
2187 {
2188 const char *cgroup;
2189 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2190 struct cgfs_files *k = NULL;
2191 struct file_info *file_info;
2192 struct fuse_context *fc = fuse_get_context();
2193 int ret;
2194
2195 if (!fc)
2196 return -EIO;
2197
2198 controller = pick_controller_from_path(fc, path);
2199 if (!controller)
2200 return -errno;
2201 cgroup = find_cgroup_in_path(path);
2202 if (!cgroup)
2203 return -errno;
2204
2205 get_cgdir_and_path(cgroup, &cgdir, &last);
2206 if (!last) {
2207 path1 = "/";
2208 path2 = cgdir;
2209 } else {
2210 path1 = cgdir;
2211 path2 = last;
2212 }
2213
2214 k = cgfs_get_key(controller, path1, path2);
2215 if (!k) {
2216 ret = -EINVAL;
2217 goto out;
2218 }
2219 free_key(k);
2220
2221 pid_t initpid = lookup_initpid_in_store(fc->pid);
2222 if (initpid <= 0)
2223 initpid = fc->pid;
2224 if (!caller_may_see_dir(initpid, controller, path1)) {
2225 ret = -ENOENT;
2226 goto out;
2227 }
2228 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2229 ret = -EACCES;
2230 goto out;
2231 }
2232
2233 /* we'll free this at cg_release */
2234 file_info = malloc(sizeof(*file_info));
2235 if (!file_info) {
2236 ret = -ENOMEM;
2237 goto out;
2238 }
2239 file_info->controller = must_copy_string(controller);
2240 file_info->cgroup = must_copy_string(path1);
2241 file_info->file = must_copy_string(path2);
2242 file_info->type = LXC_TYPE_CGFILE;
2243 file_info->buf = NULL;
2244 file_info->buflen = 0;
2245
2246 fi->fh = (unsigned long)file_info;
2247 ret = 0;
2248
2249 out:
2250 free(cgdir);
2251 return ret;
2252 }
2253
2254 int cg_access(const char *path, int mode)
2255 {
2256 int ret;
2257 const char *cgroup;
2258 char *path1, *path2, *controller;
2259 char *last = NULL, *cgdir = NULL;
2260 struct cgfs_files *k = NULL;
2261 struct fuse_context *fc = fuse_get_context();
2262
2263 if (strcmp(path, "/cgroup") == 0)
2264 return 0;
2265
2266 if (!fc)
2267 return -EIO;
2268
2269 controller = pick_controller_from_path(fc, path);
2270 if (!controller)
2271 return -errno;
2272 cgroup = find_cgroup_in_path(path);
2273 if (!cgroup) {
2274 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2275 if ((mode & W_OK) == 0)
2276 return 0;
2277 return -EACCES;
2278 }
2279
2280 get_cgdir_and_path(cgroup, &cgdir, &last);
2281 if (!last) {
2282 path1 = "/";
2283 path2 = cgdir;
2284 } else {
2285 path1 = cgdir;
2286 path2 = last;
2287 }
2288
2289 k = cgfs_get_key(controller, path1, path2);
2290 if (!k) {
2291 if ((mode & W_OK) == 0)
2292 ret = 0;
2293 else
2294 ret = -EACCES;
2295 goto out;
2296 }
2297 free_key(k);
2298
2299 pid_t initpid = lookup_initpid_in_store(fc->pid);
2300 if (initpid <= 0)
2301 initpid = fc->pid;
2302 if (!caller_may_see_dir(initpid, controller, path1)) {
2303 ret = -ENOENT;
2304 goto out;
2305 }
2306 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2307 ret = -EACCES;
2308 goto out;
2309 }
2310
2311 ret = 0;
2312
2313 out:
2314 free(cgdir);
2315 return ret;
2316 }
2317
/* FUSE release: free the file_info allocated in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2323
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/* Wait up to @timeout seconds for @sock to become readable (or hang up).
 * Returns true when input/hangup is pending; false on error or timeout
 * (errno is cleared to 0 on a pure timeout). */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, saved_errno;
	int now, starttime, deltatime;

	starttime = time(NULL);
	if (starttime < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	now = time(NULL);
	if (now < 0) {
		close(epfd);
		return false;
	}

	/* Recompute the remaining budget each time we are woken early. */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) {
		/* timed out */
		errno = 0;
		close(epfd);
		return false;
	}

	ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;

	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2371
/* recv() from @sockfd, but first wait (up to 2s) for readability so the
 * MSG_DONTWAIT recv cannot spuriously fail with EAGAIN. Returns the byte
 * count from recv(), or -1 on timeout/error. */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2378
2379 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2380 {
2381 struct msghdr msg = { 0 };
2382 struct iovec iov;
2383 struct cmsghdr *cmsg;
2384 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2385 char buf[1];
2386 buf[0] = 'p';
2387
2388 if (pingfirst) {
2389 if (msgrecv(sock, buf, 1) != 1) {
2390 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2391 return SEND_CREDS_FAIL;
2392 }
2393 }
2394
2395 msg.msg_control = cmsgbuf;
2396 msg.msg_controllen = sizeof(cmsgbuf);
2397
2398 cmsg = CMSG_FIRSTHDR(&msg);
2399 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2400 cmsg->cmsg_level = SOL_SOCKET;
2401 cmsg->cmsg_type = SCM_CREDENTIALS;
2402 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2403
2404 msg.msg_name = NULL;
2405 msg.msg_namelen = 0;
2406
2407 buf[0] = v;
2408 iov.iov_base = buf;
2409 iov.iov_len = sizeof(buf);
2410 msg.msg_iov = &iov;
2411 msg.msg_iovlen = 1;
2412
2413 if (sendmsg(sock, &msg, 0) < 0) {
2414 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2415 if (errno == 3)
2416 return SEND_CREDS_NOTSK;
2417 return SEND_CREDS_FAIL;
2418 }
2419
2420 return SEND_CREDS_OK;
2421 }
2422
/* Receive one SCM_CREDENTIALS message plus a one-byte payload from @sock.
 * Protocol: enable SO_PASSCRED and write a one-byte ping so the sender
 * (send_creds() with pingfirst=true) knows credential passing is armed,
 * then recvmsg() the real message (with a 2s readability timeout).
 * On success *cred holds the kernel-translated credentials and *v the
 * payload byte. Returns false on any socket error or timeout. */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	/* Poison the result so a message without credentials is detectable. */
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only trust the ancillary data if it is exactly one ucred. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2480
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;  /* pipe used to ACK the parent once we are running */
	int sock;    /* socket passed through to wrapped() */
	pid_t tpid;  /* target pid passed through to wrapped() */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2487
2488 /*
2489 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2490 * with clone(). This simply writes '1' as ACK back to the parent
2491 * before calling the actual wrapped function.
2492 */
2493 static int pid_ns_clone_wrapper(void *arg) {
2494 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2495 char b = '1';
2496
2497 close(args->cpipe[0]);
2498 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2499 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2500 close(args->cpipe[1]);
2501 return args->wrapped(args->sock, args->tpid);
2502 }
2503
2504 /*
2505 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2506 * int value back over the socket. This shifts the pid from the
2507 * sender's pidns into tpid's pidns.
2508 */
2509 static int pid_to_ns(int sock, pid_t tpid)
2510 {
2511 char v = '0';
2512 struct ucred cred;
2513
2514 while (recv_creds(sock, &cred, &v)) {
2515 if (v == '1')
2516 return 0;
2517 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2518 return 1;
2519 }
2520 return 0;
2521 }
2522
2523
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child of do_read_pids(); never returns, always _exit()s.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target task @tpid. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned grandchild ACKs that it started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone() (not fork(), see comment above); the stack grows down,
	 * so pass the high end of the alloca'd region. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2580
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * Returns true on success with the translated pid list appended to *d
 * (caller frees); false on any failure.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Raw pid list as seen in our (host) pid namespace. */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send each host pid as SCM_CREDENTIALS; the kernel
		 * translates it into the child's pid namespace. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* task vanished; skip this pid */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* v == '1' tells the child helper to exit. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2668
/*
 * FUSE read handler for files under /cgroup.
 *
 * Pid-list files (tasks, cgroup.procs) get their pids translated into
 * the reader's pid namespace via do_read_pids(); every other key is
 * passed through from the backing cgroup filesystem.
 *
 * Returns the number of bytes copied into @buf, 0 for reads at a
 * non-zero offset (no partial-read support), or a negative errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	/* Whole value is returned on the first read; no seeking. */
	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Existence check only; the key itself is not needed. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Append a newline when the (possibly truncated) value lacks one
	 * and there is room left in the caller's buffer. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2735
2736 static int pid_from_ns(int sock, pid_t tpid)
2737 {
2738 pid_t vpid;
2739 struct ucred cred;
2740 char v;
2741 int ret;
2742
2743 cred.uid = 0;
2744 cred.gid = 0;
2745 while (1) {
2746 if (!wait_for_sock(sock, 2)) {
2747 lxcfs_error("%s\n", "Timeout reading from parent.");
2748 return 1;
2749 }
2750 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2751 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2752 return 1;
2753 }
2754 if (vpid == -1) // done
2755 break;
2756 v = '0';
2757 cred.pid = vpid;
2758 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2759 v = '1';
2760 cred.pid = getpid();
2761 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2762 return 1;
2763 }
2764 }
2765 return 0;
2766 }
2767
/*
 * pid_from_ns_wrapper - mirror of pid_to_ns_wrapper for the write path:
 * setns into @tpid's pid namespace, then clone a child (which is the
 * first process actually inside that pidns) running pid_from_ns.
 * Runs in a forked child of do_write_pids(); never returns, always
 * _exit()s.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target task @tpid. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned grandchild ACKs that it started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* clone() so the child lands in the new pidns; the stack grows
	 * down, so pass the high end of the alloca'd region. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2813
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Reads /proc/<pid>/uid_map and delegates the range lookup to
 * convert_id_to_ns().  Returns true and stores the mapped uid in
 * *answer on success; false when the map cannot be opened or no
 * mapping exists.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char path[400];
	int ret;

	/* Bounded formatting: sprintf() could not realistically overflow
	 * 400 bytes with a pid, but snprintf() makes that explicit. */
	ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* convert_id_to_ns() signals "no mapping" with (uid_t)-1. */
	if (*answer == -1)
		return false;
	return true;
}
2835
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left as (uid_t)-1 / (gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;

	/* Bounded formatting instead of sprintf(). */
	if (snprintf(line, sizeof(line), "/proc/%d/status", pid) < 0)
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* "Uid:"/"Gid:" lines list real, effective, saved and fs ids;
	 * %u picks up the first (real) one. */
	while (fgets(line, 400, f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2874
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . they are owned by the same uid,
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's userns where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	/* Moving yourself is always allowed. */
	if (r == v)
		return true;

	/* Host root may move anyone. */
	if (r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);

	/* Same owner. */
	if (r_uid == victim_uid)
		return true;

	/* @r is root inside its own userns and @v's uid is mapped there. */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(victim_uid, r, &mapped);
}
2900
/*
 * do_write_pids - write a caller-supplied list of pids into a cgroup's
 * tasks/cgroup.procs file, translating each pid from the writer's pid
 * namespace into the host's via a helper child in @tpid's pidns.
 *
 * @tpid/@tuid: pid and uid of the writing task (for permission checks)
 * @contrl/@cg/@file: target controller, cgroup, and key
 * @buf: newline-separated pid list as written by the caller
 *
 * Returns true when every pid was permitted and written, false otherwise.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid); /* never returns; _exit()s */
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		/* Ship the writer-pidns pid to the helper... */
		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		/* ...and get back the host-pidns pid via SCM_CREDENTIALS.
		 * v == '1' means the helper could not translate this one. */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1; /* sentinel telling the helper to exit */
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose() flushes; a failed flush means the kernel
		 * rejected the pid write, so report failure. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2979
2980 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2981 struct fuse_file_info *fi)
2982 {
2983 struct fuse_context *fc = fuse_get_context();
2984 char *localbuf = NULL;
2985 struct cgfs_files *k = NULL;
2986 struct file_info *f = (struct file_info *)fi->fh;
2987 bool r;
2988
2989 if (f->type != LXC_TYPE_CGFILE) {
2990 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2991 return -EIO;
2992 }
2993
2994 if (offset)
2995 return 0;
2996
2997 if (!fc)
2998 return -EIO;
2999
3000 localbuf = alloca(size+1);
3001 localbuf[size] = '\0';
3002 memcpy(localbuf, buf, size);
3003
3004 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3005 size = -EINVAL;
3006 goto out;
3007 }
3008
3009 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3010 size = -EACCES;
3011 goto out;
3012 }
3013
3014 if (strcmp(f->file, "tasks") == 0 ||
3015 strcmp(f->file, "/tasks") == 0 ||
3016 strcmp(f->file, "/cgroup.procs") == 0 ||
3017 strcmp(f->file, "cgroup.procs") == 0)
3018 // special case - we have to translate the pids
3019 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3020 else
3021 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3022
3023 if (!r)
3024 size = -EINVAL;
3025
3026 out:
3027 free_key(k);
3028 return size;
3029 }
3030
/*
 * FUSE chown handler for /cgroup entries.  Refuses to touch the
 * /cgroup root or bare controller directories, then requires the
 * caller to be privileged over the current owner before delegating to
 * cgfs_chown_file().  Returns 0 or a negative errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* Split into parent dir + final component for the key lookup. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3096
/*
 * FUSE chmod handler for /cgroup entries.  Mirrors cg_chown(): refuses
 * the /cgroup root and bare controller dirs, checks that the caller is
 * privileged over the entry's owner (root-in-userns is enough here:
 * NS_ROOT_OPT), then delegates to cgfs_chmod_file().
 * Returns 0 or a negative errno.
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* Split into parent dir + final component for the key lookup. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3165
/*
 * FUSE mkdir handler for /cgroup entries.  Only allows creating a
 * cgroup underneath (or at) the caller's own cgroup, and only when the
 * caller may write the parent.  Returns 0 or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Resolve the caller's init pid so permission checks are done
	 * against the container's view, not the host's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* @next is the first path component outside the caller's
		 * cgroup; if it equals @last, the dir already "exists"
		 * from the caller's point of view. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3219
/*
 * FUSE rmdir handler for /cgroup entries.  Only allows removing a
 * cgroup strictly below the caller's own cgroup, and only when the
 * caller may write the parent directory.  Returns 0 or a negative
 * errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Resolve the caller's init pid so permission checks are done
	 * against the container's view, not the host's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup is "busy"; anything
		 * outside its subtree simply doesn't exist for it. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3280
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3287
/*
 * parse_memstat - pull the "total_*" counters out of a memory.stat
 * blob.  Values in the file are bytes; each output is converted to
 * KiB.  Outputs whose key is absent are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	/* Key -> destination table; scanned in order, first match wins. */
	struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached        },
		{ "total_active_anon",   active_anon   },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file   },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable   },
		{ "total_shmem",         shmem         },
	};
	size_t i;
	char *eol;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			size_t keylen = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, keylen) != 0)
				continue;
			sscanf(memstat + keylen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024; /* bytes -> KiB */
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3324
/*
 * get_blkio_io_value - find the "<major>:<minor> <iotype>" entry in a
 * blkio stat blob @str and store its value in *v (0 if not found).
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *eol;

	memset(key, 0, sizeof(key));
	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	/* Walk the blob line by line until the key matches. */
	while (*str) {
		if (strncmp(str, key, keylen) == 0) {
			sscanf(str + keylen, "%lu", v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
3347
/*
 * read_file - copy the contents of @path into the per-file cache
 * d->buf (capacity d->buflen), then serve up to @size bytes of it from
 * offset 0 into @buf.  Sets d->size to the full cached length.
 * Returns the number of bytes copied, or 0 on open/format error or if
 * the file does not fit the cache.
 */
int read_file(const char *path, char *buf, size_t size, struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		/* snprintf bounds the copy and reports truncation. */
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3387
3388 /*
3389 * FUSE ops for /proc
3390 */
3391
/*
 * get_memlimit - read the numeric memory limit @file of @cgroup.
 * Returns (unsigned long)-1 when the value cannot be read, which
 * callers treat as "no limit".
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (cgfs_get_value("memory", cgroup, file, &value))
		limit = strtoul(value, NULL, 10);

	free(value); /* free(NULL) is a no-op */

	return limit;
}
3404
/*
 * get_min_memlimit - smallest @file limit found on the path from
 * @cgroup up to the cgroup root.  A cgroup's effective limit can be
 * constrained by any ancestor, hence the upward walk.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* strdupa: dirname() modifies its argument, so work on a
	 * stack-allocated copy. */
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	/* NOTE(review): assumes @cgroup is an absolute path; for a
	 * relative path dirname() would converge on "." and this loop
	 * would never terminate — confirm callers always pass "/..." . */
	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		/* (unsigned long)-1 means "could not read"; skip those. */
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	};

	return retlimit;
}
3421
/*
 * proc_meminfo_read - FUSE read handler for the virtualized
 * /proc/meminfo.
 *
 * Rewrites the host's meminfo line by line so that totals, free
 * memory, swap and page-cache figures reflect the reader's memory
 * cgroup limits/usage rather than the host's.  The rendered text is
 * cached in d->buf so subsequent reads at non-zero offsets can be
 * served from the cache.  Returns bytes copied into @buf, 0 at EOF or
 * on error, or -EINVAL for an offset past the cached size.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Use the container's init pid to find the reader's memory cg. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		/* bytes -> KiB, matching meminfo units */
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	/* Rewrite the host meminfo line by line into the cache. */
	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more than the host actually has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
			/* -u/swap-off: pretend there is no swap at all. */
			snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts && opts->swap_off == false) {
			unsigned long swaptotal = memswlimit,
					swapusage = memswusage - memusage,
					swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active:         %8lu kB\n",
					active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
					inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem:          %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line; /* pass the host line through */

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3613
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed
 */
char *get_cpuset(const char *cg)
{
	char *value = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &value))
		return NULL;

	return value;
}
3626
3627 bool cpu_in_cpuset(int cpu, const char *cpuset);
3628
/*
 * cpuline_in_cpuset - for a /proc/cpuinfo "processor : N" line, report
 * whether cpu N is contained in @cpuset.  Non-processor lines are
 * never "in" the set.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}
3637
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. Parameter value is
 * returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;

	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64, not "%ld": int64_t is "long long" on 32-bit platforms,
	 * so "%ld" would scan into the wrong width there. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3663
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	/* Round up: a fractional quota still grants one more CPU. */
	count = quota / period;
	if (quota % period)
		count += 1;

	/* Never report more CPUs than the host has online. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3697
/*
 * Return the exact (possibly fractional) number of visible CPUs based
 * on CPU quotas. If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	double count;
	int nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Cap at the number of CPUs the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3726
/*
 * Determine whether CPU views should be used: both the cpu and the
 * cpuacct cgroup controllers must be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3745
/*
 * Check whether this is a "^processor" line in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int unused;

	return sscanf(line, "processor : %d", &unused) == 1;
}
3757
/* FUSE read handler producing a per-container view of /proc/cpuinfo.
 *
 * Only processor entries whose physical CPU is in the reader's cpuset
 * are emitted (further capped by the quota-derived max_cpu_count() when
 * cpu views are enabled), renumbered consecutively from 0. s390x uses a
 * different cpuinfo layout ("processor N:" one-liners plus a
 * "# processors" count) and is handled separately. The rendered file is
 * cached in `d` so reads at non-zero offsets are served from cache.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offsets are served out of the cache filled by a
	 * previous offset-0 read. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the reader's init pid and its cpuset cgroup; with no
	 * cgroup, pass the host file through unchanged. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* The first line reveals an s390x-style cpuinfo. */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's processor count; it is regenerated for
		 * s390x below.
		 * NOTE(review): only 12 chars of the 13-char prefix
		 * "# processors:" are compared — verify intended. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU limit is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				/* Renumber visible processors from 0. */
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x one-line "processor N: ..." format. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			/* Re-emit the line with the renumbered CPU id. */
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Copy every other line while inside a visible CPU's
		 * stanza (always, on s390x, until a limit breaks out). */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: rebuild the file in a fresh buffer — vendor line, then
	 * the visible processor count, then the body collected above. */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	/* Cache the rendered file and satisfy this offset-0 read. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3933
/* Return the start time of the registered reaper/init process of `pid`,
 * read from field (22) of /proc/<qpid>/stat, in clock ticks since boot.
 * On failure 0 is returned with errno set to EINVAL; since 0 is also a
 * valid start time, callers must check errno to distinguish. */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	/* Map the caller's pid to its registered reaper. */
	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument suppression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 */
	ret = fscanf(f, "%*d " /* (1) pid %d */
			"%*s " /* (2) comm %s */
			"%*c " /* (3) state %c */
			"%*d " /* (4) ppid %d */
			"%*d " /* (5) pgrp %d */
			"%*d " /* (6) session %d */
			"%*d " /* (7) tty_nr %d */
			"%*d " /* (8) tpgid %d */
			"%*u " /* (9) flags %u */
			"%*u " /* (10) minflt %lu */
			"%*u " /* (11) cminflt %lu */
			"%*u " /* (12) majflt %lu */
			"%*u " /* (13) cmajflt %lu */
			"%*u " /* (14) utime %lu */
			"%*u " /* (15) stime %lu */
			"%*d " /* (16) cutime %ld */
			"%*d " /* (17) cstime %ld */
			"%*d " /* (18) priority %ld */
			"%*d " /* (19) nice %ld */
			"%*d " /* (20) num_threads %ld */
			"%*d " /* (21) itrealvalue %ld */
			"%" PRIu64, /* (22) starttime %llu */
			&starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	/* Success: clear errno so 0 can be disambiguated. */
	errno = 0;
	return starttime;
}
4012
4013 static double get_reaper_start_time_in_sec(pid_t pid)
4014 {
4015 uint64_t clockticks, ticks_per_sec;
4016 int64_t ret;
4017 double res = 0;
4018
4019 clockticks = get_reaper_start_time(pid);
4020 if (clockticks == 0 && errno == EINVAL) {
4021 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
4022 return 0;
4023 }
4024
4025 ret = sysconf(_SC_CLK_TCK);
4026 if (ret < 0 && errno == EINVAL) {
4027 lxcfs_debug(
4028 "%s\n",
4029 "failed to determine number of clock ticks in a second");
4030 return 0;
4031 }
4032
4033 ticks_per_sec = (uint64_t)ret;
4034 res = (double)clockticks / ticks_per_sec;
4035 return res;
4036 }
4037
4038 static double get_reaper_age(pid_t pid)
4039 {
4040 uint64_t uptime_ms;
4041 double procstart, procage;
4042
4043 /* We need to substract the time the process has started since system
4044 * boot minus the time when the system has started to get the actual
4045 * reaper age.
4046 */
4047 procstart = get_reaper_start_time_in_sec(pid);
4048 procage = procstart;
4049 if (procstart > 0) {
4050 int ret;
4051 struct timespec spec;
4052
4053 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4054 if (ret < 0)
4055 return 0;
4056
4057 /* We could make this more precise here by using the tv_nsec
4058 * field in the timespec struct and convert it to milliseconds
4059 * and then create a double for the seconds and milliseconds but
4060 * that seems more work than it is worth.
4061 */
4062 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4063 procage = (uptime_ms - (procstart * 1000)) / 1000;
4064 }
4065
4066 return procage;
4067 }
4068
4069 /*
4070 * Returns 0 on success.
4071 * It is the caller's responsibility to free `return_usage`, unless this
4072 * function returns an error.
4073 */
4074 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4075 {
4076 int cpucount = get_nprocs_conf();
4077 struct cpuacct_usage *cpu_usage;
4078 int rv = 0, i, j, ret;
4079 int cg_cpu;
4080 uint64_t cg_user, cg_system;
4081 int64_t ticks_per_sec;
4082 char *usage_str = NULL;
4083
4084 ticks_per_sec = sysconf(_SC_CLK_TCK);
4085
4086 if (ticks_per_sec < 0 && errno == EINVAL) {
4087 lxcfs_v(
4088 "%s\n",
4089 "read_cpuacct_usage_all failed to determine number of clock ticks "
4090 "in a second");
4091 return -1;
4092 }
4093
4094 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4095 if (!cpu_usage)
4096 return -ENOMEM;
4097
4098 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
4099 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4100 // read cpuacct.usage_percpu instead
4101 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4102 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
4103 rv = -1;
4104 goto err;
4105 }
4106 lxcfs_v("usage_str: %s\n", usage_str);
4107
4108 // convert cpuacct.usage_percpu into cpuacct.usage_all
4109 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4110
4111 char *data = NULL;
4112 size_t sz = 0, asz = 0;
4113
4114 must_strcat(&data, &sz, &asz, "cpu user system\n");
4115
4116 int i = 0, read_pos = 0, read_cnt=0;
4117 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4118 lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4119 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4120 i++;
4121 read_pos += read_cnt;
4122 }
4123
4124 free(usage_str);
4125 usage_str = data;
4126
4127 lxcfs_v("usage_str: %s\n", usage_str);
4128 }
4129
4130 int read_pos = 0, read_cnt=0;
4131 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4132 lxcfs_error("read_cpuacct_usage_all reading first line from "
4133 "%s/cpuacct.usage_all failed.\n", cg);
4134 rv = -1;
4135 goto err;
4136 }
4137
4138 read_pos += read_cnt;
4139
4140 for (i = 0, j = 0; i < cpucount; i++) {
4141 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4142 &cg_system, &read_cnt);
4143
4144 if (ret == EOF)
4145 break;
4146
4147 if (ret != 3) {
4148 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4149 "failed.\n", cg);
4150 rv = -1;
4151 goto err;
4152 }
4153
4154 read_pos += read_cnt;
4155
4156 /* Convert the time from nanoseconds to USER_HZ */
4157 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4158 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4159 j++;
4160 }
4161
4162 rv = 0;
4163 *return_usage = cpu_usage;
4164 *size = cpucount;
4165
4166 err:
4167 if (usage_str)
4168 free(usage_str);
4169
4170 if (rv != 0) {
4171 free(cpu_usage);
4172 *return_usage = NULL;
4173 }
4174
4175 return rv;
4176 }
4177
4178 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4179 {
4180 int i;
4181 unsigned long sum = 0;
4182
4183 for (i = 0; i < cpu_count; i++) {
4184 if (!newer[i].online)
4185 continue;
4186
4187 /* When cpuset is changed on the fly, the CPUs might get reordered.
4188 * We could either reset all counters, or check that the substractions
4189 * below will return expected results.
4190 */
4191 if (newer[i].user > older[i].user)
4192 diff[i].user = newer[i].user - older[i].user;
4193 else
4194 diff[i].user = 0;
4195
4196 if (newer[i].system > older[i].system)
4197 diff[i].system = newer[i].system - older[i].system;
4198 else
4199 diff[i].system = 0;
4200
4201 if (newer[i].idle > older[i].idle)
4202 diff[i].idle = newer[i].idle - older[i].idle;
4203 else
4204 diff[i].idle = 0;
4205
4206 sum += diff[i].user;
4207 sum += diff[i].system;
4208 sum += diff[i].idle;
4209 }
4210
4211 return sum;
4212 }
4213
4214 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4215 {
4216 unsigned long free_space, to_add;
4217
4218 free_space = threshold - usage->user - usage->system;
4219
4220 if (free_space > usage->idle)
4221 free_space = usage->idle;
4222
4223 to_add = free_space > *surplus ? *surplus : free_space;
4224
4225 *counter += to_add;
4226 usage->idle -= to_add;
4227 *surplus -= to_add;
4228 }
4229
4230 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4231 {
4232 struct cg_proc_stat *first = NULL, *prev, *tmp;
4233
4234 for (prev = NULL; node; ) {
4235 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4236 tmp = node;
4237 lxcfs_debug("Removing stat node for %s\n", node->cg);
4238
4239 if (prev)
4240 prev->next = node->next;
4241 else
4242 first = node->next;
4243
4244 node = node->next;
4245 free_proc_stat_node(tmp);
4246 } else {
4247 if (!first)
4248 first = node;
4249 prev = node;
4250 node = node->next;
4251 }
4252 }
4253
4254 return first;
4255 }
4256
4257 #define PROC_STAT_PRUNE_INTERVAL 10
4258 static void prune_proc_stat_history(void)
4259 {
4260 int i;
4261 time_t now = time(NULL);
4262
4263 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4264 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4265
4266 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4267 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4268 return;
4269 }
4270
4271 if (proc_stat_history[i]->next) {
4272 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4273 proc_stat_history[i]->lastcheck = now;
4274 }
4275
4276 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4277 }
4278 }
4279
4280 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4281 {
4282 struct cg_proc_stat *node;
4283
4284 pthread_rwlock_rdlock(&head->lock);
4285
4286 if (!head->next) {
4287 pthread_rwlock_unlock(&head->lock);
4288 return NULL;
4289 }
4290
4291 node = head->next;
4292
4293 do {
4294 if (strcmp(cg, node->cg) == 0)
4295 goto out;
4296 } while ((node = node->next));
4297
4298 node = NULL;
4299
4300 out:
4301 pthread_rwlock_unlock(&head->lock);
4302 prune_proc_stat_history();
4303 return node;
4304 }
4305
4306 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4307 {
4308 struct cg_proc_stat *node;
4309 int i;
4310
4311 node = malloc(sizeof(struct cg_proc_stat));
4312 if (!node)
4313 goto err;
4314
4315 node->cg = NULL;
4316 node->usage = NULL;
4317 node->view = NULL;
4318
4319 node->cg = malloc(strlen(cg) + 1);
4320 if (!node->cg)
4321 goto err;
4322
4323 strcpy(node->cg, cg);
4324
4325 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4326 if (!node->usage)
4327 goto err;
4328
4329 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4330
4331 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4332 if (!node->view)
4333 goto err;
4334
4335 node->cpu_count = cpu_count;
4336 node->next = NULL;
4337
4338 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4339 lxcfs_error("%s\n", "Failed to initialize node lock");
4340 goto err;
4341 }
4342
4343 for (i = 0; i < cpu_count; i++) {
4344 node->view[i].user = 0;
4345 node->view[i].system = 0;
4346 node->view[i].idle = 0;
4347 }
4348
4349 return node;
4350
4351 err:
4352 if (node && node->cg)
4353 free(node->cg);
4354 if (node && node->usage)
4355 free(node->usage);
4356 if (node && node->view)
4357 free(node->view);
4358 if (node)
4359 free(node);
4360
4361 return NULL;
4362 }
4363
4364 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4365 {
4366 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4367 struct cg_proc_stat_head *head = proc_stat_history[hash];
4368 struct cg_proc_stat *node, *rv = new_node;
4369
4370 pthread_rwlock_wrlock(&head->lock);
4371
4372 if (!head->next) {
4373 head->next = new_node;
4374 goto out;
4375 }
4376
4377 node = head->next;
4378
4379 for (;;) {
4380 if (strcmp(node->cg, new_node->cg) == 0) {
4381 /* The node is already present, return it */
4382 free_proc_stat_node(new_node);
4383 rv = node;
4384 goto out;
4385 }
4386
4387 if (node->next) {
4388 node = node->next;
4389 continue;
4390 }
4391
4392 node->next = new_node;
4393 goto out;
4394 }
4395
4396 out:
4397 pthread_rwlock_unlock(&head->lock);
4398 return rv;
4399 }
4400
4401 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4402 {
4403 struct cpuacct_usage *new_usage, *new_view;
4404 int i;
4405
4406 /* Allocate new memory */
4407 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4408 if (!new_usage)
4409 return false;
4410
4411 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4412 if (!new_view) {
4413 free(new_usage);
4414 return false;
4415 }
4416
4417 /* Copy existing data & initialize new elements */
4418 for (i = 0; i < cpu_count; i++) {
4419 if (i < node->cpu_count) {
4420 new_usage[i].user = node->usage[i].user;
4421 new_usage[i].system = node->usage[i].system;
4422 new_usage[i].idle = node->usage[i].idle;
4423
4424 new_view[i].user = node->view[i].user;
4425 new_view[i].system = node->view[i].system;
4426 new_view[i].idle = node->view[i].idle;
4427 } else {
4428 new_usage[i].user = 0;
4429 new_usage[i].system = 0;
4430 new_usage[i].idle = 0;
4431
4432 new_view[i].user = 0;
4433 new_view[i].system = 0;
4434 new_view[i].idle = 0;
4435 }
4436 }
4437
4438 free(node->usage);
4439 free(node->view);
4440
4441 node->usage = new_usage;
4442 node->view = new_view;
4443 node->cpu_count = cpu_count;
4444
4445 return true;
4446 }
4447
4448 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4449 {
4450 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4451 struct cg_proc_stat_head *head = proc_stat_history[hash];
4452 struct cg_proc_stat *node;
4453
4454 node = find_proc_stat_node(head, cg);
4455
4456 if (!node) {
4457 node = new_proc_stat_node(usage, cpu_count, cg);
4458 if (!node)
4459 return NULL;
4460
4461 node = add_proc_stat_node(node);
4462 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4463 }
4464
4465 pthread_mutex_lock(&node->lock);
4466
4467 /* If additional CPUs on the host have been enabled, CPU usage counter
4468 * arrays have to be expanded */
4469 if (node->cpu_count < cpu_count) {
4470 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4471 node->cpu_count, cpu_count, cg);
4472
4473 if (!expand_proc_stat_node(node, cpu_count)) {
4474 pthread_mutex_unlock(&node->lock);
4475 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4476 node->cpu_count, cpu_count, cg);
4477 return NULL;
4478 }
4479 }
4480
4481 return node;
4482 }
4483
4484 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4485 {
4486 int i;
4487
4488 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4489 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4490
4491 for (i = 0; i < cpu_count; i++) {
4492 node->view[i].user = 0;
4493 node->view[i].system = 0;
4494 node->view[i].idle = 0;
4495 }
4496
4497 node->cpu_count = cpu_count;
4498 }
4499
/* Render the cpuview flavour of /proc/stat for cgroup `cg` into `buf`.
 *
 * Per-CPU container usage (`cg_cpu_usage`, from cpuacct) is diffed
 * against the history stored in proc_stat_history; idle time is derived
 * from the host's /proc/stat (`f`, positioned after its first line);
 * usage that landed on CPUs beyond the quota-derived limit is
 * redistributed onto the visible CPUs. Returns the number of bytes
 * written to `buf`, or 0 on error.
 */
static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
{
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	struct cpuacct_usage *diff = NULL;
	int nprocs = get_nprocs_conf();

	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N */
			break;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu ++;
		cpu_cnt ++;

		/* CPUs outside the cgroup's cpuset are marked offline. */
		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++) {
				cg_cpu_usage[i].online = false;
			}
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			&user,
			&nice,
			&system,
			&idle,
			&iowait,
			&irq,
			&softirq,
			&steal,
			&guest,
			&guest_nice);

		if (ret != 10)
			continue;

		/* Container idle = host idle + host busy time not
		 * attributed to this cgroup. */
		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
					"%lu in cpuacct.usage_all; unable to determine idle time\n",
					curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);

	if (!stat_node) {
		lxcfs_error("unable to find/create stat node for %s\n", cg);
		rv = 0;
		goto err;
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		rv = 0;
		goto err;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 * Only the first online CPU is inspected.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	/* Accumulate the deltas into the stored usage; deltas on CPUs past
	 * the visible limit become surplus to redistribute below. */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		/* Spread the surplus onto visible CPUs that still have
		 * headroom under the threshold: user first, then system. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(
					&user_surplus,
					&diff[curcpu],
					&diff[curcpu].user,
					threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(
					&system_surplus,
					&diff[curcpu],
					&diff[curcpu].system,
					threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		unsigned long diff_user = 0;
		unsigned long diff_system = 0;
		unsigned long diff_idle = 0;
		unsigned long max_diff_idle = 0;
		unsigned long max_diff_idle_index = 0;
		/* Fold the (redistributed) deltas into the per-CPU view,
		 * tracking the CPU with the largest idle delta for the
		 * fractional-quota adjustment below. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		// revise cpu usage view to support partial cpu case
		double exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus){
			/* Shave the idle time corresponding to the fractional
			 * part of the quota off the idle-richest CPU. */
			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		/* No quota limit: the view simply mirrors the accumulated
		 * usage counters. */
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
			user_sum,
			system_sum,
			idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;
	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				i,
				stat_node->view[curcpu].user,
				stat_node->view[curcpu].system,
				stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);

		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	rv = total_len;

err:
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);
	if (line)
		free(line);
	if (diff)
		free(diff);
	return rv;
}
4847
/* Half of the scratch headroom is reserved for the aggregate "cpu " line,
 * which can only be formatted once every per-cpu line has been summed. */
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * FUSE read handler for the container view of /proc/stat.
 *
 * CPU lines are rewritten so that only cpus in the container's cpuset
 * appear, renumbered from 0, and — when cpuacct.usage_all is readable —
 * with user/system time taken from the cgroup instead of the host.
 * Non-cpu lines are passed through unchanged.
 *
 * Buffer layout: per-cpu output is staged at d->buf + CPUALL_MAX_SIZE;
 * the summed "cpu " line is later written at d->buf and the staged data
 * moved up behind it. Returns bytes copied into @buf, 0 on error.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	/* Continuation read: serve the remainder of the cached result. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;

	/*
	 * when container run with host pid namespace initpid == 1, cgroup will "/"
	 * we should return host os's /proc contents.
	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
	 */
	if (initpid == 1) {
		return read_file("/proc/stat", buf, size, d);
	}

	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* Alternate rendering path with full per-container cpu views. */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide cpus outside the container's cpuset. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10 || !cg_cpu_usage) {
			/* No cgroup accounting available (or short host line):
			 * pass the host line through, renumbered to curcpu. */
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				/* Host time the container did not consume is
				 * credited to it as extra idle time. */
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	/* Format the aggregate "cpu " line into the reserved head of d->buf. */
	cache = d->buf;

	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Close the gap between the "cpu " line and the staged per-cpu lines. */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	/* Cache the rendered result for subsequent offset reads. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
5102
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 */
static double get_reaper_busy(pid_t task)
{
	char *cg = NULL, *val = NULL;
	double seconds = 0;
	pid_t reaper;

	reaper = lookup_initpid_in_store(task);
	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (!cg)
		goto cleanup;
	prune_init_slice(cg);

	/* cpuacct.usage is in nanoseconds; convert to seconds. */
	if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &val))
		seconds = (double)strtoul(val, NULL, 10) / 1000000000;

cleanup:
	free(cg);
	free(val);
	return seconds;
}
5134
#if RELOADTEST
/* Reload-test hook: drop a marker file so the test harness can tell that
 * the freshly reloaded library actually executed this code path. */
void iwashere(void)
{
	int fd;

	fd = creat("/tmp/lxcfs-iwashere", 0644);
	if (fd >= 0)
		close(fd);
}
#endif
5145
/*
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the mtime for the reaper for
 * the calling pid as returned by getreaperage
 */
static int proc_uptime_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	double busytime = get_reaper_busy(fc->pid);
	char *cache = d->buf;
	ssize_t total_len = 0;
	double idletime, reaperage;

#if RELOADTEST
	iwashere();
#endif

	/* Continuation read: serve the remainder of the cached result. */
	if (offset){
		if (!d->cached)
			return 0;
		if (offset > d->size)
			return -EINVAL;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Container "uptime" is the age of its init/reaper process. */
	reaperage = get_reaper_age(fc->pid);
	/* To understand why this is done, please read the comment to the
	 * get_reaper_busy() function.
	 */
	idletime = reaperage;
	if (reaperage >= busytime)
		idletime = reaperage - busytime;

	total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
	if (total_len < 0 || total_len >= d->buflen){
		lxcfs_error("%s\n", "failed to write to cache");
		return 0;
	}

	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size) total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
5198
/*
 * FUSE read handler for the container view of /proc/diskstats.
 *
 * For every device line of the host's /proc/diskstats, the counters are
 * replaced with values derived from the container's blkio cgroup
 * (io_serviced / io_merged / io_service_bytes / io_wait_time /
 * io_service_time recursive files). Devices with no container I/O at all
 * are omitted. Returns bytes copied into @buf, 0 on error.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
			*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continuation read: serve the remainder of the cached result. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	/* No blkio cgroup: fall back to the unmodified host file. */
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* blkio reports bytes; /proc/diskstats wants 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Tick fields: service + wait time, converted ns -> ms. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		/* Emit the device only if the container did any I/O on it.
		 * ios_pgr and rq_ticks are not tracked and stay 0. */
		memset(lbuf, 0, 256);
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	/* Cache the rendered result for subsequent offset reads. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5331
5332 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5333 struct fuse_file_info *fi)
5334 {
5335 struct fuse_context *fc = fuse_get_context();
5336 struct file_info *d = (struct file_info *)fi->fh;
5337 char *cg = NULL;
5338 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5339 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5340 ssize_t total_len = 0, rv = 0;
5341 ssize_t l = 0;
5342 char *cache = d->buf;
5343
5344 if (offset) {
5345 if (offset > d->size)
5346 return -EINVAL;
5347 if (!d->cached)
5348 return 0;
5349 int left = d->size - offset;
5350 total_len = left > size ? size: left;
5351 memcpy(buf, cache + offset, total_len);
5352 return total_len;
5353 }
5354
5355 pid_t initpid = lookup_initpid_in_store(fc->pid);
5356 if (initpid <= 0)
5357 initpid = fc->pid;
5358 cg = get_pid_cgroup(initpid, "memory");
5359 if (!cg)
5360 return read_file("/proc/swaps", buf, size, d);
5361 prune_init_slice(cg);
5362
5363 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5364
5365 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5366 goto err;
5367
5368 memusage = strtoul(memusage_str, NULL, 10);
5369
5370 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5371 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5372
5373 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5374 memswusage = strtoul(memswusage_str, NULL, 10);
5375
5376 swap_total = (memswlimit - memlimit) / 1024;
5377 swap_free = (memswusage - memusage) / 1024;
5378 }
5379
5380 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5381
5382 /* When no mem + swap limit is specified or swapaccount=0*/
5383 if (!memswlimit) {
5384 char *line = NULL;
5385 size_t linelen = 0;
5386 FILE *f = fopen("/proc/meminfo", "r");
5387
5388 if (!f)
5389 goto err;
5390
5391 while (getline(&line, &linelen, f) != -1) {
5392 if (startswith(line, "SwapTotal:")) {
5393 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5394 } else if (startswith(line, "SwapFree:")) {
5395 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5396 }
5397 }
5398
5399 free(line);
5400 fclose(f);
5401 }
5402
5403 if (swap_total > 0) {
5404 l = snprintf(d->buf + total_len, d->size - total_len,
5405 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5406 swap_total, swap_free);
5407 total_len += l;
5408 }
5409
5410 if (total_len < 0 || l < 0) {
5411 perror("Error writing to cache");
5412 rv = 0;
5413 goto err;
5414 }
5415
5416 d->cached = 1;
5417 d->size = (int)total_len;
5418
5419 if (total_len > size) total_len = size;
5420 memcpy(buf, d->buf, total_len);
5421 rv = total_len;
5422
5423 err:
5424 free(cg);
5425 free(memswlimit_str);
5426 free(memlimit_str);
5427 free(memusage_str);
5428 free(memswusage_str);
5429 return rv;
5430 }
/*
 * Find the process pids from a cgroup path,
 * e.g. from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
 * @pid_buf : the array the collected pid strings are appended to.
 * @dpath   : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many levels of child cgroups to descend into.
 * @sum     : the number of pids collected so far; the new total is returned.
 * @cfd     : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu.
 */
/*
 * Recursively collect the pid lines of cgroup.procs under @dpath (up to
 * @depth levels of child cgroups), appending each to *pid_buf. Returns
 * the updated element count. Allocation failures are retried forever,
 * matching the file's convention of never failing on OOM.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + '\0' */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() will close it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	/* First recurse into child cgroup directories. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name + '\0' */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Then read this cgroup's own cgroup.procs. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		/* Grow the pid array by one slot and copy the pid line
		 * (still including its trailing newline). */
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5517 /*
5518 * calc_load calculates the load according to the following formula:
5519 * load1 = load0 * exp + active * (1 - exp)
5520 *
5521 * @load1: the new loadavg.
5522 * @load0: the former loadavg.
5523 * @active: the total number of running pid at this moment.
5524 * @exp: the fixed-point defined in the beginning.
5525 */
5526 static unsigned long
5527 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5528 {
5529 unsigned long newload;
5530
5531 active = active > 0 ? active * FIXED_1 : 0;
5532 newload = load * exp + active * (FIXED_1 - exp);
5533 if (active >= load)
5534 newload += FIXED_1 - 1;
5535
5536 return newload / FIXED_1;
5537 }
5538
5539 /*
5540 * Return 0 means that container p->cg is closed.
5541 * Return -1 means that error occurred in refresh.
5542 * Positive num equals the total number of pid.
5543 */
5544 static int refresh_load(struct load_node *p, char *path)
5545 {
5546 FILE *f = NULL;
5547 char **idbuf;
5548 char proc_path[256];
5549 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5550 char *line = NULL;
5551 size_t linelen = 0;
5552 int sum, length;
5553 DIR *dp;
5554 struct dirent *file;
5555
5556 do {
5557 idbuf = malloc(sizeof(char *));
5558 } while (!idbuf);
5559 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5560 /* normal exit */
5561 if (sum == 0)
5562 goto out;
5563
5564 for (i = 0; i < sum; i++) {
5565 /*clean up '\n' */
5566 length = strlen(idbuf[i])-1;
5567 idbuf[i][length] = '\0';
5568 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5569 if (ret < 0 || ret > 255) {
5570 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5571 i = sum;
5572 sum = -1;
5573 goto err_out;
5574 }
5575
5576 dp = opendir(proc_path);
5577 if (!dp) {
5578 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5579 continue;
5580 }
5581 while ((file = readdir(dp)) != NULL) {
5582 if (strncmp(file->d_name, ".", 1) == 0)
5583 continue;
5584 if (strncmp(file->d_name, "..", 1) == 0)
5585 continue;
5586 total_pid++;
5587 /* We make the biggest pid become last_pid.*/
5588 ret = atof(file->d_name);
5589 last_pid = (ret > last_pid) ? ret : last_pid;
5590
5591 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5592 if (ret < 0 || ret > 255) {
5593 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5594 i = sum;
5595 sum = -1;
5596 closedir(dp);
5597 goto err_out;
5598 }
5599 f = fopen(proc_path, "r");
5600 if (f != NULL) {
5601 while (getline(&line, &linelen, f) != -1) {
5602 /* Find State */
5603 if ((line[0] == 'S') && (line[1] == 't'))
5604 break;
5605 }
5606 if ((line[7] == 'R') || (line[7] == 'D'))
5607 run_pid++;
5608 fclose(f);
5609 }
5610 }
5611 closedir(dp);
5612 }
5613 /*Calculate the loadavg.*/
5614 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5615 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5616 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5617 p->run_pid = run_pid;
5618 p->total_pid = total_pid;
5619 p->last_pid = last_pid;
5620
5621 free(line);
5622 err_out:
5623 for (; i > 0; i--)
5624 free(idbuf[i-1]);
5625 out:
5626 free(idbuf);
5627 return sum;
5628 }
5629 /*
5630 * Traverse the hash table and update it.
5631 */
5632 void *load_begin(void *arg)
5633 {
5634
5635 char *path = NULL;
5636 int i, sum, length, ret;
5637 struct load_node *f;
5638 int first_node;
5639 clock_t time1, time2;
5640
5641 while (1) {
5642 if (loadavg_stop == 1)
5643 return NULL;
5644
5645 time1 = clock();
5646 for (i = 0; i < LOAD_SIZE; i++) {
5647 pthread_mutex_lock(&load_hash[i].lock);
5648 if (load_hash[i].next == NULL) {
5649 pthread_mutex_unlock(&load_hash[i].lock);
5650 continue;
5651 }
5652 f = load_hash[i].next;
5653 first_node = 1;
5654 while (f) {
5655 length = strlen(f->cg) + 2;
5656 do {
5657 /* strlen(f->cg) + '.' or '' + \0 */
5658 path = malloc(length);
5659 } while (!path);
5660
5661 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5662 if (ret < 0 || ret > length - 1) {
5663 /* snprintf failed, ignore the node.*/
5664 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5665 goto out;
5666 }
5667 sum = refresh_load(f, path);
5668 if (sum == 0) {
5669 f = del_node(f, i);
5670 } else {
5671 out: f = f->next;
5672 }
5673 free(path);
5674 /* load_hash[i].lock locks only on the first node.*/
5675 if (first_node == 1) {
5676 first_node = 0;
5677 pthread_mutex_unlock(&load_hash[i].lock);
5678 }
5679 }
5680 }
5681
5682 if (loadavg_stop == 1)
5683 return NULL;
5684
5685 time2 = clock();
5686 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5687 }
5688 }
5689
/*
 * FUSE read handler for the container view of /proc/loadavg.
 * When the loadavg daemon is enabled, reports the per-cgroup averages
 * maintained by load_begin(); otherwise (or without a cpu cgroup) the
 * host file is passed through. Returns bytes copied, 0 on error.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation read: serve the remainder of the cached result. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* NOTE: locate_node() returns with load_hash[hash].rdlock held; every
	 * path below must release it before returning. */
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		/* Allocation failures are retried forever, per file convention. */
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 adds 0.005 so the two-decimal output rounds correctly. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5782 /* Return a positive number on success, return 0 on failure.*/
5783 pthread_t load_daemon(int load_use)
5784 {
5785 int ret;
5786 pthread_t pid;
5787
5788 ret = init_load();
5789 if (ret == -1) {
5790 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5791 return 0;
5792 }
5793 ret = pthread_create(&pid, NULL, load_begin, NULL);
5794 if (ret != 0) {
5795 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5796 load_free();
5797 return 0;
5798 }
5799 /* use loadavg, here loadavg = 1*/
5800 loadavg = load_use;
5801 return pid;
5802 }
5803
5804 /* Returns 0 on success. */
5805 int stop_load_daemon(pthread_t pid)
5806 {
5807 int s;
5808
5809 /* Signal the thread to gracefully stop */
5810 loadavg_stop = 1;
5811
5812 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5813 if (s != 0) {
5814 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5815 return -1;
5816 }
5817
5818 load_free();
5819 loadavg_stop = 0;
5820
5821 return 0;
5822 }
5823
/* Sum the byte length of every line in @which; returns 0 when the file
 * cannot be opened. Used to size read buffers for emulated /proc files. */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t n;
	off_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	while ((n = getline(&line, &cap, fp)) != -1)
		total += n;

	free(line);
	fclose(fp);

	return total;
}
5840
/* FUSE getattr handler for the virtual /proc directory and the files
 * lxcfs emulates. All timestamps are "now"; file sizes are reported as 0. */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *procfiles[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	/* The directory itself: read-only, two links (. and ..). */
	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	/* Every emulated file: regular, world-readable. */
	for (i = 0; i < sizeof(procfiles) / sizeof(procfiles[0]); i++) {
		if (strcmp(path, procfiles[i]) == 0) {
			sb->st_size = 0;
			sb->st_mode = S_IFREG | 00444;
			sb->st_nlink = 1;
			return 0;
		}
	}

	return -ENOENT;
}
5870
5871 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5872 struct fuse_file_info *fi)
5873 {
5874 if (filler(buf, ".", NULL, 0) != 0 ||
5875 filler(buf, "..", NULL, 0) != 0 ||
5876 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5877 filler(buf, "meminfo", NULL, 0) != 0 ||
5878 filler(buf, "stat", NULL, 0) != 0 ||
5879 filler(buf, "uptime", NULL, 0) != 0 ||
5880 filler(buf, "diskstats", NULL, 0) != 0 ||
5881 filler(buf, "swaps", NULL, 0) != 0 ||
5882 filler(buf, "loadavg", NULL, 0) != 0)
5883 return -EINVAL;
5884 return 0;
5885 }
5886
5887 int proc_open(const char *path, struct fuse_file_info *fi)
5888 {
5889 int type = -1;
5890 struct file_info *info;
5891
5892 if (strcmp(path, "/proc/meminfo") == 0)
5893 type = LXC_TYPE_PROC_MEMINFO;
5894 else if (strcmp(path, "/proc/cpuinfo") == 0)
5895 type = LXC_TYPE_PROC_CPUINFO;
5896 else if (strcmp(path, "/proc/uptime") == 0)
5897 type = LXC_TYPE_PROC_UPTIME;
5898 else if (strcmp(path, "/proc/stat") == 0)
5899 type = LXC_TYPE_PROC_STAT;
5900 else if (strcmp(path, "/proc/diskstats") == 0)
5901 type = LXC_TYPE_PROC_DISKSTATS;
5902 else if (strcmp(path, "/proc/swaps") == 0)
5903 type = LXC_TYPE_PROC_SWAPS;
5904 else if (strcmp(path, "/proc/loadavg") == 0)
5905 type = LXC_TYPE_PROC_LOADAVG;
5906 if (type == -1)
5907 return -ENOENT;
5908
5909 info = malloc(sizeof(*info));
5910 if (!info)
5911 return -ENOMEM;
5912
5913 memset(info, 0, sizeof(*info));
5914 info->type = type;
5915
5916 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5917 do {
5918 info->buf = malloc(info->buflen);
5919 } while (!info->buf);
5920 memset(info->buf, 0, info->buflen);
5921 /* set actual size to buffer size */
5922 info->size = info->buflen;
5923
5924 fi->fh = (unsigned long)info;
5925 return 0;
5926 }
5927
/* FUSE access handler: the emulated /proc tree is read-only. */
int proc_access(const char *path, int mask)
{
	/* The directory itself is fine if the host lets us read it. */
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if (mask & ~R_OK)
		return -EACCES;

	return 0;
}
5938
/* FUSE release handler: free the per-handle state allocated in proc_open(). */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5944
5945 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5946 struct fuse_file_info *fi)
5947 {
5948 struct file_info *f = (struct file_info *) fi->fh;
5949
5950 switch (f->type) {
5951 case LXC_TYPE_PROC_MEMINFO:
5952 return proc_meminfo_read(buf, size, offset, fi);
5953 case LXC_TYPE_PROC_CPUINFO:
5954 return proc_cpuinfo_read(buf, size, offset, fi);
5955 case LXC_TYPE_PROC_UPTIME:
5956 return proc_uptime_read(buf, size, offset, fi);
5957 case LXC_TYPE_PROC_STAT:
5958 return proc_stat_read(buf, size, offset, fi);
5959 case LXC_TYPE_PROC_DISKSTATS:
5960 return proc_diskstats_read(buf, size, offset, fi);
5961 case LXC_TYPE_PROC_SWAPS:
5962 return proc_swaps_read(buf, size, offset, fi);
5963 case LXC_TYPE_PROC_LOADAVG:
5964 return proc_loadavg_read(buf, size, offset, fi);
5965 default:
5966 return -EINVAL;
5967 }
5968 }
5969
5970 /*
5971 * Functions needed to setup cgroups in the __constructor__.
5972 */
5973
/* Create @dir and any missing parent directories with @mode, like
 * `mkdir -p`. Returns true on success, false on allocation or mkdir
 * failure (EEXIST is not a failure). */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *segment = dir;
	const char *cursor = dir;
	char *prefix;

	do {
		/* advance past any run of '/', then past the next component */
		segment = cursor + strspn(cursor, "/");
		cursor = segment + strcspn(segment, "/");

		prefix = strndup(dir, segment - dir);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);
	} while (cursor != segment);

	return true;
}
5997
5998 static bool umount_if_mounted(void)
5999 {
6000 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
6001 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
6002 return false;
6003 }
6004 return true;
6005 }
6006
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
/* Return true if the statfs result @fs carries filesystem magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	return (fs->f_type == (fs_type_magic)magic_val);
}
6013
6014 /*
6015 * looking at fs/proc_namespace.c, it appears we can
6016 * actually expect the rootfs entry to very specifically contain
6017 * " - rootfs rootfs "
6018 * IIUC, so long as we've chrooted so that rootfs is not our root,
6019 * the rootfs entry should always be skipped in mountinfo contents.
6020 */
static bool is_on_ramfs(void)
{
	FILE *f;
	char *p, *p2;
	char *line = NULL;
	size_t len = 0;
	int i;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&line, &len, f) != -1) {
		/* Skip the first four space-separated mountinfo fields; p
		 * ends up on the space preceding the mount-point field. */
		for (p = line, i = 0; p && i < 4; i++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;
		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		/* NUL-terminate the mount-point field in place. */
		*p2 = '\0';
		if (strcmp(p + 1, "/") == 0) {
			// this is '/'. is it the ramfs?
			/* After the optional fields comes a "-" separator
			 * followed by "<fstype> <source>"; match the exact
			 * rootfs signature (see comment above). */
			p = strchr(p2 + 1, '-');
			if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
				free(line);
				fclose(f);
				return true;
			}
		}
	}
	/* getline()'s buffer must be freed even on EOF/error. */
	free(line);
	fclose(f);
	return false;
}
6056
6057 static int pivot_enter()
6058 {
6059 int ret = -1, oldroot = -1, newroot = -1;
6060
6061 oldroot = open("/", O_DIRECTORY | O_RDONLY);
6062 if (oldroot < 0) {
6063 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
6064 return ret;
6065 }
6066
6067 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
6068 if (newroot < 0) {
6069 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
6070 goto err;
6071 }
6072
6073 /* change into new root fs */
6074 if (fchdir(newroot) < 0) {
6075 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6076 goto err;
6077 }
6078
6079 /* pivot_root into our new root fs */
6080 if (pivot_root(".", ".") < 0) {
6081 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
6082 goto err;
6083 }
6084
6085 /*
6086 * At this point the old-root is mounted on top of our new-root.
6087 * To unmounted it we must not be chdir'd into it, so escape back
6088 * to the old-root.
6089 */
6090 if (fchdir(oldroot) < 0) {
6091 lxcfs_error("%s\n", "Failed to enter old root.");
6092 goto err;
6093 }
6094
6095 if (umount2(".", MNT_DETACH) < 0) {
6096 lxcfs_error("%s\n", "Failed to detach old root.");
6097 goto err;
6098 }
6099
6100 if (fchdir(newroot) < 0) {
6101 lxcfs_error("%s\n", "Failed to re-enter new root.");
6102 goto err;
6103 }
6104
6105 ret = 0;
6106
6107 err:
6108 if (oldroot > 0)
6109 close(oldroot);
6110 if (newroot > 0)
6111 close(newroot);
6112
6113 return ret;
6114 }
6115
6116 static int chroot_enter()
6117 {
6118 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6119 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6120 return -1;
6121 }
6122
6123 if (chroot(".") < 0) {
6124 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6125 return -1;
6126 }
6127
6128 if (chdir("/") < 0) {
6129 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6130 return -1;
6131 }
6132
6133 return 0;
6134 }
6135
/* Move into the prepared new root: chroot() when running on a ramfs,
 * pivot_root() everywhere else. Returns 0 on success, -1 on failure. */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable: when the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC, so when the magic check says no we
	 * still consult /proc/self/mountinfo. */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() < 0) {
		lxcfs_error("%s\n", "Could not perform pivot root.");
		return -1;
	}

	return 0;
}
6158
6159 /* Prepare our new clean root. */
6160 static int permute_prepare(void)
6161 {
6162 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6163 lxcfs_error("%s\n", "Failed to create directory for new root.");
6164 return -1;
6165 }
6166
6167 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6168 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6169 return -1;
6170 }
6171
6172 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6173 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6174 return -1;
6175 }
6176
6177 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6178 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6179 return -1;
6180 }
6181
6182 return 0;
6183 }
6184
/* Calls chroot() on ramfs, pivot_root() in all other cases.
 * Prepares the new root and then enters it; true only if both steps
 * succeed. */
static bool permute_root(void)
{
	return permute_prepare() == 0 && permute_and_enter() == 0;
}
6198
/* Open (and thereby pin) the mount namespace of process @pid.
 *
 * Returns an O_RDONLY | O_CLOEXEC fd on /proc/<pid>/ns/mnt, or -1 on
 * formatting or open() failure.
 */
static int preserve_mnt_ns(int pid)
{
	/* Large enough for "/proc/" + any 64-bit decimal pid + "/ns/mnt". */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int n;

	n = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (n < 0 || (size_t)n >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
6211
/* Set up a private mount namespace with a fresh tmpfs on BASEDIR, ready for
 * the per-controller cgroup mounts.
 *
 * Side effect: stores a pinned fd for the new mount namespace in the global
 * cgroup_mount_ns_fd. The steps below are strictly ordered: the mountpoint
 * must exist and be clean before unsharing, and "/" must be made private
 * before mounting the tmpfs so nothing propagates back to the host.
 *
 * Returns true on success, false (with an error logged) otherwise.
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Clear out any stale mount left by a previous lxcfs instance. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* From here on all mounts happen in our own mount namespace. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep our mounts from propagating to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6247
6248 static bool cgfs_mount_hierarchies(void)
6249 {
6250 char *target;
6251 size_t clen, len;
6252 int i, ret;
6253
6254 for (i = 0; i < num_hierarchies; i++) {
6255 char *controller = hierarchies[i];
6256
6257 clen = strlen(controller);
6258 len = strlen(BASEDIR) + clen + 2;
6259 target = malloc(len);
6260 if (!target)
6261 return false;
6262
6263 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6264 if (ret < 0 || ret >= len) {
6265 free(target);
6266 return false;
6267 }
6268 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6269 free(target);
6270 return false;
6271 }
6272 if (!strcmp(controller, "unified"))
6273 ret = mount("none", target, "cgroup2", 0, NULL);
6274 else
6275 ret = mount(controller, target, "cgroup", 0, controller);
6276 if (ret < 0) {
6277 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6278 free(target);
6279 return false;
6280 }
6281
6282 fd_hierarchies[i] = open(target, O_DIRECTORY);
6283 if (fd_hierarchies[i] < 0) {
6284 free(target);
6285 return false;
6286 }
6287 free(target);
6288 }
6289 return true;
6290 }
6291
/* Full private-cgroup setup: prepare the namespace and tmpfs, mount every
 * hierarchy, then permute into the new root. True only if all three steps
 * succeed. */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6307
/* Library constructor: discover the cgroup hierarchies this process is in
 * (from /proc/self/cgroup), privately mount them for lxcfs in a dedicated
 * mount namespace, switch back to the initial namespace, and initialize the
 * per-container CPU view. On any failure it logs and returns, leaving lxcfs
 * partially initialized. */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line looks like "<idx>:<controllers>:<path>"; carve out the
	 * middle field in place. */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One cached directory fd per discovered hierarchy. */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember where we were: the pivot/chroot below changes cwd. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6398
6399 static void __attribute__((destructor)) free_subsystems(void)
6400 {
6401 int i;
6402
6403 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6404
6405 for (i = 0; i < num_hierarchies; i++) {
6406 if (hierarchies[i])
6407 free(hierarchies[i]);
6408 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6409 close(fd_hierarchies[i]);
6410 }
6411 free(hierarchies);
6412 free(fd_hierarchies);
6413 free_cpuview();
6414
6415 if (cgroup_mount_ns_fd >= 0)
6416 close(cgroup_mount_ns_fd);
6417 }