]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
support /sys/devices/system/cpu/online
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
#define FUSE_USE_VERSION 26

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h" // for VERSION
41
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Thin wrapper around the raw syscall: make @new_root the root mount and
 * move the old root to @put_old. Returns -1 with errno = ENOSYS on
 * kernels/arches that do not define __NR_pivot_root. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
56
/* One CPU's usage counters for the cpuview cache.
 * NOTE(review): units depend on where these are filled in (elsewhere in
 * the file) — presumably cpuacct/proc-stat time units; confirm there. */
struct cpuacct_usage {
	uint64_t user;   /* time spent in user mode */
	uint64_t system; /* time spent in kernel mode */
	uint64_t idle;   /* time spent idle */
	bool online;     /* is this CPU currently online? */
};
63
/* Parameters of the loadavg hash table. */
65 #define LOAD_SIZE 100 /*the size of hash_table */
66 #define FLUSH_TIME 5 /*the flush rate */
67 #define DEPTH_DIR 3 /*the depth of per cgroup */
/* Constants for the fixed-point load-average calculation. */
69 #define FSHIFT 11 /* nr of bits of precision */
70 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
71 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
72 #define EXP_5 2014 /* 1/exp(5sec/5min) */
73 #define EXP_15 2037 /* 1/exp(5sec/15min) */
74 #define LOAD_INT(x) ((x) >> FSHIFT)
75 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
76 /*
77 * This parameter is used for proc_loadavg_read().
78 * 1 means use loadavg, 0 means not use.
79 */
80 static int loadavg = 0;
81 static volatile sig_atomic_t loadavg_stop = 0;
/* ELF hash of @name, masked to a non-negative int.
 * Bytes are read as unsigned char so the result does not depend on the
 * platform's char signedness (plain char may be signed, which would feed
 * negative values into the hash for bytes > 0x7f). */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int x = 0;

	/* ELFHash algorithm. */
	while (*name) {
		hash = (hash << 4) + (unsigned char)*name++;
		x = hash & 0xf0000000;
		if (x != 0)
			hash ^= (x >> 24);
		hash &= ~x;
	}
	return (hash & 0x7fffffff);
}
96
/* Per-cgroup load-average state; nodes live in the load_hash chains. */
struct load_node {
	char *cg; /* cgroup path — the hash key */
	unsigned long avenrun[3]; /* Load averages (fixed-point, cf. FSHIFT/FIXED_1) */
	unsigned int run_pid;   /* NOTE(review): maintained elsewhere; looks like running-task count — confirm */
	unsigned int total_pid; /* NOTE(review): looks like total task count — confirm */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in this hash chain */
	struct load_node **pre; /* address of the pointer that points at this
				 * node; allows O(1) unlink (see del_node()) */
};
107
/* One bucket of the loadavg hash table, with its three-lock scheme. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node. To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node. To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic (concurrent) read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node. To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic (concurrent) read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of this bucket's chain */
};
129
130 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
131 /*
132 * init_load initialize the hash table.
133 * Return 0 on success, return -1 on failure.
134 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
	/* Unwind the partially-initialized bucket @i: each label destroys
	 * exactly the members that were set up before the failing init. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	/* Then fully tear down all earlier, completely-initialized buckets. */
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
172
173 static void insert_node(struct load_node **n, int locate)
174 {
175 struct load_node *f;
176
177 pthread_mutex_lock(&load_hash[locate].lock);
178 pthread_rwlock_wrlock(&load_hash[locate].rilock);
179 f = load_hash[locate].next;
180 load_hash[locate].next = *n;
181
182 (*n)->pre = &(load_hash[locate].next);
183 if (f)
184 f->pre = &((*n)->next);
185 (*n)->next = f;
186 pthread_mutex_unlock(&load_hash[locate].lock);
187 pthread_rwlock_unlock(&load_hash[locate].rilock);
188 }
189 /*
190 * locate_node() finds special node. Not return NULL means success.
191 * It should be noted that rdlock isn't unlocked at the end of code
192 * because this function is used to read special node. Delete is not
193 * allowed before read has ended.
194 * unlock rdlock only in proc_loadavg_read().
195 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	/* rilock guards the bucket head against concurrent insert; rdlock is
	 * deliberately NOT released here — per the comment above, the caller
	 * (proc_loadavg_read()) unlocks it once it is done with the node. */
	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the chain for an exact cgroup-path match. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
213 /* Delete the load_node n and return the next node of it. */
214 static struct load_node *del_node(struct load_node *n, int locate)
215 {
216 struct load_node *g;
217
218 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
219 if (n->next == NULL) {
220 *(n->pre) = NULL;
221 } else {
222 *(n->pre) = n->next;
223 n->next->pre = n->pre;
224 }
225 g = n->next;
226 free(n->cg);
227 free(n);
228 pthread_rwlock_unlock(&load_hash[locate].rdlock);
229 return g;
230 }
231
/* Tear down the whole loadavg hash table: free every node and destroy
 * each bucket's locks. All three locks are acquired first so no reader,
 * inserter or refresher is mid-flight when a bucket is destroyed. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free the chain; list invariants need not be maintained
		 * since the whole bucket is going away. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
264
/* Data for CPU view */
/* Cached per-cgroup CPU statistics, chained in proc_stat_history buckets. */
struct cg_proc_stat {
	char *cg; /* cgroup path this entry caches stats for */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* presumably the element count of @usage/@view — confirm at fill site */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next; /* next node in the hash bucket */
};
274
/* Head of one proc_stat_history hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next; /* first node of this bucket's chain */
	time_t lastcheck; /* set at init; NOTE(review): presumably refreshed by pruning code elsewhere — confirm */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
284
285 #define CPUVIEW_HASH_SIZE 100
286 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
287
288 static bool cpuview_init_head(struct cg_proc_stat_head **head)
289 {
290 *head = malloc(sizeof(struct cg_proc_stat_head));
291 if (!(*head)) {
292 lxcfs_error("%s\n", strerror(errno));
293 return false;
294 }
295
296 (*head)->lastcheck = time(NULL);
297 (*head)->next = NULL;
298
299 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
300 lxcfs_error("%s\n", "Failed to initialize list lock");
301 free(*head);
302 return false;
303 }
304
305 return true;
306 }
307
308 static bool init_cpuview()
309 {
310 int i;
311
312 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
313 proc_stat_history[i] = NULL;
314
315 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
316 if (!cpuview_init_head(&proc_stat_history[i]))
317 goto err;
318 }
319
320 return true;
321
322 err:
323 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
324 if (proc_stat_history[i]) {
325 free(proc_stat_history[i]);
326 proc_stat_history[i] = NULL;
327 }
328 }
329
330 return false;
331 }
332
333 static void free_proc_stat_node(struct cg_proc_stat *node)
334 {
335 pthread_mutex_destroy(&node->lock);
336 free(node->cg);
337 free(node->usage);
338 free(node->view);
339 free(node);
340 }
341
342 static void cpuview_free_head(struct cg_proc_stat_head *head)
343 {
344 struct cg_proc_stat *node, *tmp;
345
346 if (head->next) {
347 node = head->next;
348
349 for (;;) {
350 tmp = node;
351 node = node->next;
352 free_proc_stat_node(tmp);
353
354 if (!node)
355 break;
356 }
357 }
358
359 pthread_rwlock_destroy(&head->lock);
360 free(head);
361 }
362
363 static void free_cpuview()
364 {
365 int i;
366
367 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
368 if (proc_stat_history[i])
369 cpuview_free_head(proc_stat_history[i]);
370 }
371 }
372
373 /*
374 * A table caching which pid is init for a pid namespace.
375 * When looking up which pid is init for $qpid, we first
376 * 1. Stat /proc/$qpid/ns/pid.
377 * 2. Check whether the ino_t is in our store.
378 * a. if not, fork a child in qpid's ns to send us
379 * ucred.pid = 1, and read the initpid. Cache
380 * initpid and creation time for /proc/initpid
381 * in a new store entry.
382 * b. if so, verify that /proc/initpid still matches
383 * what we have saved. If not, clear the store
384 * entry and go back to a. If so, return the
385 * cached initpid.
386 */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // last time this entry was validated (used for pruning)
};
394
395 /* lol - look at how they are allocated in the kernel */
396 #define PIDNS_HASH_SIZE 4096
397 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
398
399 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
400 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock @l; a failure here indicates corrupted state, so abort. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
410
411 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
412 * Number of hierarchies mounted. */
413 static int num_hierarchies;
414
415 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
416 * Hierachies mounted {cpuset, blkio, ...}:
417 * Initialized via __constructor__ collect_and_mount_subsystems(). */
418 static char **hierarchies;
419
420 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
421 * Open file descriptors:
422 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
423 * private mount namespace.
424 * Initialized via __constructor__ collect_and_mount_subsystems().
425 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
426 * mounts and respective files in the private namespace even when located in
427 * another namespace using the *at() family of functions
428 * {openat(), fchownat(), ...}. */
429 static int *fd_hierarchies;
430 static int cgroup_mount_ns_fd = -1;
431
/* Unlock @l; a failure here indicates corrupted state, so abort. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
441
/* Acquire the global pidns_init_store lock (aborts on failure). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
446
/* Release the global pidns_init_store lock (aborts on failure). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
451
/* Must be called under store_lock.
 * Return true if cache entry @e still describes a live init process:
 * compares the recorded ctime of /proc/$initpid against the current one,
 * so a recycled pid (new process => new ctime) invalidates the entry.
 * @nsfdsb is unused here. */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}
469
470 /* Must be called under store_lock */
471 static void remove_initpid(struct pidns_init_store *e)
472 {
473 struct pidns_init_store *tmp;
474 int h;
475
476 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
477
478 h = HASH(e->ino);
479 if (pidns_hash_table[h] == e) {
480 pidns_hash_table[h] = e->next;
481 free(e);
482 return;
483 }
484
485 tmp = pidns_hash_table[h];
486 while (tmp) {
487 if (tmp->next == e) {
488 tmp->next = e->next;
489 free(e);
490 return;
491 }
492 tmp = tmp->next;
493 }
494 }
495
#define PURGE_SECS 5
/* Must be called under store_lock.
 * Drop cached entries not validated within the last 2*PURGE_SECS
 * seconds. The scan runs at most once every PURGE_SECS seconds; the
 * very first call only arms the timer. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
538
/* Must be called under store_lock.
 * Cache @pid as the init pid of the pid namespace whose ns inode is in
 * @sb. Records /proc/$pid's ctime so later lookups can detect pid reuse
 * (see initpid_still_valid()). Silently does nothing if /proc/$pid
 * cannot be stat'ed (process already gone). */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	/* Retry until the allocation succeeds (file-wide convention). */
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}
563
564 /*
565 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
566 * entry for the inode number and creation time. Verify that the init pid
567 * is still valid. If not, remove it. Return the entry if valid, NULL
568 * otherwise.
569 * Must be called under store_lock
570 */
571 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
572 {
573 int h = HASH(sb->st_ino);
574 struct pidns_init_store *e = pidns_hash_table[h];
575
576 while (e) {
577 if (e->ino == sb->st_ino) {
578 if (initpid_still_valid(e, sb)) {
579 e->lastcheck = time(NULL);
580 return e;
581 }
582 remove_initpid(e);
583 return NULL;
584 }
585 e = e->next;
586 }
587
588 return NULL;
589 }
590
/* Return 1 if @path, resolved relative to directory descriptor @fd (as
 * in the *at() family), is a directory; 0 otherwise. */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/* Bug fix: the flags argument used to be @fd, which is not a valid
	 * fstatat() flag value and made the call fail with EINVAL. */
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
599
/* Duplicate @str, retrying until the allocation succeeds; callers rely
 * on a non-NULL result for non-NULL input. NULL maps to NULL. */
static char *must_copy_string(const char *str)
{
	char *dup;

	if (!str)
		return NULL;

	do {
		dup = strdup(str);
	} while (!dup);

	return dup;
}
611
/* Strip every trailing '\n' from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && s[len - 1] == '\n')
		s[--len] = '\0';
}
619
#define BATCH_SIZE 50
/* Grow *@mem (a BATCH_SIZE-granular buffer) so it can hold @newlen
 * bytes, retrying realloc until it succeeds. No-op when the current
 * batch count already suffices. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	size_t new_batches = (newlen / BATCH_SIZE) + 1;
	size_t old_batches = (oldlen / BATCH_SIZE) + 1;

	if (*mem && new_batches <= old_batches)
		return;

	for (;;) {
		char *tmp = realloc(*mem, new_batches * BATCH_SIZE);

		if (tmp) {
			*mem = tmp;
			return;
		}
	}
}
/* Append @linelen bytes of @line, plus its terminating NUL, to the
 * growable buffer *@contents, updating *@len. Growth is delegated to
 * dorealloc(), which cannot fail (it retries). */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen+1);
	*len = newlen;
}
641
/* Read the whole of @fd into a newly allocated string with trailing
 * newlines stripped; returns NULL on failure. Consumes @fd in all cases:
 * on success fclose() closes it, and on fdopen() failure it is now
 * closed explicitly (previously it leaked, and callers such as
 * cgfs_get_value() never close it themselves). @from is the file's name;
 * it is unused here. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
663
/* Write @string to @fd, reporting errors against the name @fnam.
 * Consumes @fd in all cases: fclose() closes it on the normal paths, and
 * on fdopen() failure it is now closed explicitly (previously leaked).
 * Returns true only if every byte was written and the close succeeded. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a short write can surface here. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
689
/* Ownership and mode of one cgroup file, as produced by cgfs_get_key(). */
struct cgfs_files {
	char *name;        /* file (or cgroup) base name; heap-allocated */
	uint32_t uid, gid; /* owner, from stat() */
	uint32_t mode;     /* file mode bits, from stat() */
};
695
#define ALLOC_NUM 20
/* Append hierarchy name @h (copied) to the global hierarchies array,
 * growing it in ALLOC_NUM-sized batches. @stridx is unused here.
 * Always returns true; allocation failure aborts the process. */
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}
713
714 static void print_subsystems(void)
715 {
716 int i;
717
718 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
719 fprintf(stderr, "hierarchies:\n");
720 for (i = 0; i < num_hierarchies; i++) {
721 if (hierarchies[i])
722 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
723 fd_hierarchies[i], hierarchies[i]);
724 }
725 }
726
/* Return true if @needle appears as a whole element of the
 * comma-separated list @haystack (e.g. "cpu" in "cpu,cpuacct"). */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t needle_len = strlen(needle);

	for (;;) {
		const char *comma = strchr(cur, ',');

		if (!comma)
			break;
		if ((size_t)(comma - cur) == needle_len &&
		    strncmp(needle, cur, needle_len) == 0)
			return true;
		cur = comma + 1;
	}

	/* The final (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
745
746 /* do we need to do any massaging here? I'm not sure... */
747 /* Return the mounted controller and store the corresponding open file descriptor
748 * referring to the controller mountpoint in the private lxcfs namespace in
749 * @cfd.
750 */
751 static char *find_mounted_controller(const char *controller, int *cfd)
752 {
753 int i;
754
755 for (i = 0; i < num_hierarchies; i++) {
756 if (!hierarchies[i])
757 continue;
758 if (strcmp(hierarchies[i], controller) == 0) {
759 *cfd = fd_hierarchies[i];
760 return hierarchies[i];
761 }
762 if (in_comma_list(controller, hierarchies[i])) {
763 *cfd = fd_hierarchies[i];
764 return hierarchies[i];
765 }
766 }
767
768 return NULL;
769 }
770
/* Write @value into @file of @cgroup under @controller. Returns true on
 * success. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, ret;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Build a relative path for the *at() family:
	 * "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() closes the stream wrapped around @fd. */
	return write_string(path, value, fd);
}
797
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() failed, so it did not take ownership of @fd1;
		 * close it instead of leaking it (previous behavior). */
		close(fd1);
		return;
	}
	/* From here @fd1 belongs to @d and is closed by closedir(). */

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}
837
/* Create cgroup @cg under @controller, owned by @uid:@gid. Returns 0 on
 * success or a negative errno value. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return -EINVAL;

	/* Relative path for the *at() family: "." + cg + NUL. */
	len = strlen(cg) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	/* root:root requires no ownership fixup. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(path, uid, gid, cfd);

	return 0;
}
868
/* Recursively delete cgroup directory @dirname (relative to @cfd).
 * @fd is dup()ed because fdopendir()/closedir() take ownership of the
 * descriptor they are given.
 * NOTE(review): the recursion passes the caller's @fd down unchanged for
 * subdirectories — verify nested directories are enumerated as intended.
 * Returns true only if everything was removed. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Only directories need recursion; cgroup files vanish with
		 * their directory. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
927
/* Recursively remove cgroup @cg under @controller. */
bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *path, *mounted;
	bool result;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path for the *at() family: "." + cg + NUL. */
	len = strlen(cg) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, path, O_DIRECTORY);
	if (fd < 0)
		return false;

	result = recursive_rmdir(path, fd, cfd);
	close(fd);
	return result;
}
954
/* chmod @file under @controller to @mode. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path for the *at() family: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) == 0;
}
975
/* chown the "tasks" and "cgroup.procs" files inside @dirname (relative
 * to @fd) to @uid:@gid. Returns 0 on success, -errno on failure. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	static const char *suffixes[] = { "/tasks", "/cgroup.procs" };
	size_t i, len;
	char *fname;

	/* "/cgroup.procs" is the longer suffix, so the buffer fits both. */
	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);

	for (i = 0; i < sizeof(suffixes) / sizeof(suffixes[0]); i++) {
		snprintf(fname, len, "%s%s", dirname, suffixes[i]);
		if (fchownat(fd, fname, uid, gid, 0) != 0)
			return -errno;
	}

	return 0;
}
991
/* chown @file under @controller to @uid:@gid; for directories the
 * "tasks" and "cgroup.procs" files inside are chowned as well. Returns
 * 0 on success, -errno on failure. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return -EINVAL;

	/* Relative path for the *at() family: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	// like cgmanager did, we want to chown the tasks file as well
	if (is_dir(path, cfd))
		return chown_tasks_files(path, uid, gid, cfd);

	return 0;
}
1017
/* Open @cgroup's cgroup.procs file under @controller for writing.
 * Returns a FILE* (caller fclose()s it) or NULL on error. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return NULL;

	/* Relative path: "." + cgroup + "/cgroup.procs" + NUL. */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	path = alloca(len);
	snprintf(path, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1041
1042 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1043 void ***list, size_t typesize,
1044 void* (*iterator)(const char*, const char*, const char*))
1045 {
1046 int cfd, fd, ret;
1047 size_t len;
1048 char *cg, *tmpc;
1049 char pathname[MAXPATHLEN];
1050 size_t sz = 0, asz = 0;
1051 struct dirent *dirent;
1052 DIR *dir;
1053
1054 tmpc = find_mounted_controller(controller, &cfd);
1055 *list = NULL;
1056 if (!tmpc)
1057 return false;
1058
1059 /* Make sure we pass a relative path to *at() family of functions. */
1060 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1061 cg = alloca(len);
1062 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1063 if (ret < 0 || (size_t)ret >= len) {
1064 lxcfs_error("Pathname too long under %s\n", cgroup);
1065 return false;
1066 }
1067
1068 fd = openat(cfd, cg, O_DIRECTORY);
1069 if (fd < 0)
1070 return false;
1071
1072 dir = fdopendir(fd);
1073 if (!dir)
1074 return false;
1075
1076 while ((dirent = readdir(dir))) {
1077 struct stat mystat;
1078
1079 if (!strcmp(dirent->d_name, ".") ||
1080 !strcmp(dirent->d_name, ".."))
1081 continue;
1082
1083 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1084 if (ret < 0 || ret >= MAXPATHLEN) {
1085 lxcfs_error("Pathname too long under %s\n", cg);
1086 continue;
1087 }
1088
1089 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1090 if (ret) {
1091 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1092 continue;
1093 }
1094 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1095 (directories && !S_ISDIR(mystat.st_mode)))
1096 continue;
1097
1098 if (sz+2 >= asz) {
1099 void **tmp;
1100 asz += BATCH_SIZE;
1101 do {
1102 tmp = realloc(*list, asz * typesize);
1103 } while (!tmp);
1104 *list = tmp;
1105 }
1106 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1107 (*list)[sz+1] = NULL;
1108 sz++;
1109 }
1110 if (closedir(dir) < 0) {
1111 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1112 return false;
1113 }
1114 return true;
1115 }
1116
/* Iterator for cgfs_list_children(): the list entry is simply a copy of
 * the directory name. @controller and @cgroup are unused. The strdup()
 * is retried until it succeeds (file-wide convention). */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	do {
		copy = strdup(dir_entry);
	} while (!copy);

	return copy;
}
1125
/* Fill *@list with the names of all child cgroups of @cgroup under
 * @controller, as a NULL-terminated array. Caller frees the entries and
 * the array. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1130
1131 void free_key(struct cgfs_files *k)
1132 {
1133 if (!k)
1134 return;
1135 free(k->name);
1136 free(k);
1137 }
1138
/* Free a NULL-terminated array of cgfs_files entries and the array
 * itself; NULL is tolerated. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **k;

	if (!keys)
		return;

	for (k = keys; *k; k++)
		free_key(*k);
	free(keys);
}
1150
/* Read the whole of @file in @cgroup under @controller into a newly
 * allocated string stored in *@value (caller frees). Returns true on
 * success. */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path: "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	/* slurp_file() wraps @fd in a stream and closes it. */
	*value = slurp_file(path, fd);
	return *value != NULL;
}
1177
/* Return true if @file exists in @cgroup under @controller. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path: "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1199
1200 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1201 {
1202 int ret, cfd;
1203 size_t len;
1204 char *fnam, *tmpc;
1205 struct stat sb;
1206 struct cgfs_files *newkey;
1207
1208 tmpc = find_mounted_controller(controller, &cfd);
1209 if (!tmpc)
1210 return false;
1211
1212 if (file && *file == '/')
1213 file++;
1214
1215 if (file && strchr(file, '/'))
1216 return NULL;
1217
1218 /* Make sure we pass a relative path to *at() family of functions.
1219 * . + /cgroup + / + file + \0
1220 */
1221 len = strlen(cgroup) + 3;
1222 if (file)
1223 len += strlen(file) + 1;
1224 fnam = alloca(len);
1225 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1226 file ? "/" : "", file ? file : "");
1227
1228 ret = fstatat(cfd, fnam, &sb, 0);
1229 if (ret < 0)
1230 return NULL;
1231
1232 do {
1233 newkey = malloc(sizeof(struct cgfs_files));
1234 } while (!newkey);
1235 if (file)
1236 newkey->name = must_copy_string(file);
1237 else if (strrchr(cgroup, '/'))
1238 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1239 else
1240 newkey->name = must_copy_string(cgroup);
1241 newkey->uid = sb.st_uid;
1242 newkey->gid = sb.st_gid;
1243 newkey->mode = sb.st_mode;
1244
1245 return newkey;
1246 }
1247
/* cgfs_iterate_cgroup() callback: build one key entry for @dir_entry. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (!key)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			cgroup);

	return key;
}
1257
/*
 * List every key (file) of @cgroup under @controller into *keys, a
 * NULL-terminated array built by cgfs_iterate_cgroup(); each entry is
 * produced by make_key_list_entry().  Returns true on success.
 * NOTE(review): sizeof(*keys) is the size of one array slot (a
 * pointer) — presumably what cgfs_iterate_cgroup expects; confirm.
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1262
/*
 * Return true when @f names a sub-directory (i.e. a child cgroup) of
 * @cgroup inside @controller's hierarchy.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, written;
	size_t buflen;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* relative path for fstatat():  . + /cgroup + / + f + \0 */
	buflen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(buflen);
	written = snprintf(relpath, buflen, "%s%s/%s",
			   *cgroup == '/' ? "." : "", cgroup, f);
	if (written < 0 || (size_t)written >= buflen)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1290
1291 #define SEND_CREDS_OK 0
1292 #define SEND_CREDS_NOTSK 1
1293 #define SEND_CREDS_FAIL 2
1294 static bool recv_creds(int sock, struct ucred *cred, char *v);
1295 static int wait_for_pid(pid_t pid);
1296 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1297 static int send_creds_clone_wrapper(void *arg);
1298
1299 /*
1300 * clone a task which switches to @task's namespace and writes '1'.
1301 * over a unix sock so we can read the task's reaper's pid in our
1302 * namespace
1303 *
1304 * Note: glibc's fork() does not respect pidns, which can lead to failed
1305 * assertions inside glibc (and thus failed forks) if the child's pid in
1306 * the pidns and the parent pid outside are identical. Using clone prevents
1307 * this issue.
1308 */
1309 static void write_task_init_pid_exit(int sock, pid_t target)
1310 {
1311 char fnam[100];
1312 pid_t pid;
1313 int fd, ret;
1314 size_t stack_size = sysconf(_SC_PAGESIZE);
1315 void *stack = alloca(stack_size);
1316
1317 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1318 if (ret < 0 || ret >= sizeof(fnam))
1319 _exit(1);
1320
1321 fd = open(fnam, O_RDONLY);
1322 if (fd < 0) {
1323 perror("write_task_init_pid_exit open of ns/pid");
1324 _exit(1);
1325 }
1326 if (setns(fd, 0)) {
1327 perror("write_task_init_pid_exit setns 1");
1328 close(fd);
1329 _exit(1);
1330 }
1331 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1332 if (pid < 0)
1333 _exit(1);
1334 if (pid != 0) {
1335 if (!wait_for_pid(pid))
1336 _exit(1);
1337 _exit(0);
1338 }
1339 }
1340
1341 static int send_creds_clone_wrapper(void *arg) {
1342 struct ucred cred;
1343 char v;
1344 int sock = *(int *)arg;
1345
1346 /* we are the child */
1347 cred.uid = 0;
1348 cred.gid = 0;
1349 cred.pid = 1;
1350 v = '1';
1351 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1352 return 1;
1353 return 0;
1354 }
1355
1356 static pid_t get_init_pid_for_task(pid_t task)
1357 {
1358 int sock[2];
1359 pid_t pid;
1360 pid_t ret = -1;
1361 char v = '0';
1362 struct ucred cred;
1363
1364 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1365 perror("socketpair");
1366 return -1;
1367 }
1368
1369 pid = fork();
1370 if (pid < 0)
1371 goto out;
1372 if (!pid) {
1373 close(sock[1]);
1374 write_task_init_pid_exit(sock[0], task);
1375 _exit(0);
1376 }
1377
1378 if (!recv_creds(sock[1], &cred, &v))
1379 goto out;
1380 ret = cred.pid;
1381
1382 out:
1383 close(sock[0]);
1384 close(sock[1]);
1385 if (pid > 0)
1386 wait_for_pid(pid);
1387 return ret;
1388 }
1389
1390 pid_t lookup_initpid_in_store(pid_t qpid)
1391 {
1392 pid_t answer = 0;
1393 struct stat sb;
1394 struct pidns_init_store *e;
1395 char fnam[100];
1396
1397 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1398 store_lock();
1399 if (stat(fnam, &sb) < 0)
1400 goto out;
1401 e = lookup_verify_initpid(&sb);
1402 if (e) {
1403 answer = e->initpid;
1404 goto out;
1405 }
1406 answer = get_init_pid_for_task(qpid);
1407 if (answer > 0)
1408 save_initpid(&sb, answer);
1409
1410 out:
1411 /* we prune at end in case we are returning
1412 * the value we were about to return */
1413 prune_initpid_store();
1414 store_unlock();
1415 return answer;
1416 }
1417
/*
 * Reap child @pid, retrying on EINTR.  Returns 0 when the child exited
 * normally with status 0, -1 otherwise (including pid <= 0).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1438
1439
1440 /*
1441 * append pid to *src.
1442 * src: a pointer to a char* in which ot append the pid.
1443 * sz: the number of characters printed so far, minus trailing \0.
1444 * asz: the allocated size so far
1445 * pid: the pid to append
1446 */
1447 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1448 {
1449 char tmp[30];
1450
1451 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1452
1453 if (!*src || tmplen + *sz + 1 >= *asz) {
1454 char *tmp;
1455 do {
1456 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1457 } while (!tmp);
1458 *src = tmp;
1459 *asz += BUF_RESERVE_SIZE;
1460 }
1461 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1462 *sz += tmplen;
1463 }
1464
1465 /*
1466 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1467 * valid in the caller's namespace, return the id mapped into
1468 * pid's namespace.
1469 * Returns the mapped id, or -1 on error.
1470 */
1471 unsigned int
1472 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1473 {
1474 unsigned int nsuid, // base id for a range in the idfile's namespace
1475 hostuid, // base id for a range in the caller's namespace
1476 count; // number of ids in this range
1477 char line[400];
1478 int ret;
1479
1480 fseek(idfile, 0L, SEEK_SET);
1481 while (fgets(line, 400, idfile)) {
1482 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1483 if (ret != 3)
1484 continue;
1485 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1486 /*
1487 * uids wrapped around - unexpected as this is a procfile,
1488 * so just bail.
1489 */
1490 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1491 nsuid, hostuid, count, line);
1492 return -1;
1493 }
1494 if (hostuid <= in_id && hostuid+count > in_id) {
1495 /*
1496 * now since hostuid <= in_id < hostuid+count, and
1497 * hostuid+count and nsuid+count do not wrap around,
1498 * we know that nsuid+(in_id-hostuid) which must be
1499 * less that nsuid+(count) must not wrap around
1500 */
1501 return (in_id - hostuid) + nsuid;
1502 }
1503 }
1504
1505 // no answer found
1506 return -1;
1507 }
1508
1509 /*
1510 * for is_privileged_over,
1511 * specify whether we require the calling uid to be root in his
1512 * namespace
1513 */
1514 #define NS_ROOT_REQD true
1515 #define NS_ROOT_OPT false
1516
1517 #define PROCLEN 100
1518
1519 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1520 {
1521 char fpath[PROCLEN];
1522 int ret;
1523 bool answer = false;
1524 uid_t nsuid;
1525
1526 if (victim == -1 || uid == -1)
1527 return false;
1528
1529 /*
1530 * If the request is one not requiring root in the namespace,
1531 * then having the same uid suffices. (i.e. uid 1000 has write
1532 * access to files owned by uid 1000
1533 */
1534 if (!req_ns_root && uid == victim)
1535 return true;
1536
1537 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1538 if (ret < 0 || ret >= PROCLEN)
1539 return false;
1540 FILE *f = fopen(fpath, "r");
1541 if (!f)
1542 return false;
1543
1544 /* if caller's not root in his namespace, reject */
1545 nsuid = convert_id_to_ns(f, uid);
1546 if (nsuid)
1547 goto out;
1548
1549 /*
1550 * If victim is not mapped into caller's ns, reject.
1551 * XXX I'm not sure this check is needed given that fuse
1552 * will be sending requests where the vfs has converted
1553 */
1554 nsuid = convert_id_to_ns(f, victim);
1555 if (nsuid == -1)
1556 goto out;
1557
1558 answer = true;
1559
1560 out:
1561 fclose(f);
1562 return answer;
1563 }
1564
/*
 * Check whether file mode bits @fmode (a 3-bit rwx group, already
 * shifted down by the caller) grant the access requested in the
 * open-style flags @req_mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	mode_t acc = req_mode & O_ACCMODE;

	if (acc == O_RDONLY)
		needed = S_IROTH;
	else if (acc == O_WRONLY)
		needed = S_IWOTH;
	else if (acc == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1584
1585
1586 /*
1587 * taskcg is a/b/c
1588 * querycg is /a/b/c/d/e
1589 * we return 'd'
1590 */
1591 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1592 {
1593 char *start, *end;
1594
1595 if (strlen(taskcg) <= strlen(querycg)) {
1596 lxcfs_error("%s\n", "I was fed bad input.");
1597 return NULL;
1598 }
1599
1600 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1601 start = strdup(taskcg + 1);
1602 else
1603 start = strdup(taskcg + strlen(querycg) + 1);
1604 if (!start)
1605 return NULL;
1606 end = strchr(start, '/');
1607 if (end)
1608 *end = '\0';
1609 return start;
1610 }
1611
/* Trim a single trailing newline from @x, in place. */
static void stripnewline(char *x)
{
	size_t n = strlen(x);

	if (n > 0 && x[n - 1] == '\n')
		x[n - 1] = '\0';
}
1618
1619 char *get_pid_cgroup(pid_t pid, const char *contrl)
1620 {
1621 int cfd;
1622 char fnam[PROCLEN];
1623 FILE *f;
1624 char *answer = NULL;
1625 char *line = NULL;
1626 size_t len = 0;
1627 int ret;
1628 const char *h = find_mounted_controller(contrl, &cfd);
1629 if (!h)
1630 return NULL;
1631
1632 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1633 if (ret < 0 || ret >= PROCLEN)
1634 return NULL;
1635 if (!(f = fopen(fnam, "r")))
1636 return NULL;
1637
1638 while (getline(&line, &len, f) != -1) {
1639 char *c1, *c2;
1640 if (!line[0])
1641 continue;
1642 c1 = strchr(line, ':');
1643 if (!c1)
1644 goto out;
1645 c1++;
1646 c2 = strchr(c1, ':');
1647 if (!c2)
1648 goto out;
1649 *c2 = '\0';
1650 if (strcmp(c1, h) != 0)
1651 continue;
1652 c2++;
1653 stripnewline(c2);
1654 do {
1655 answer = strdup(c2);
1656 } while (!answer);
1657 break;
1658 }
1659
1660 out:
1661 fclose(f);
1662 free(line);
1663 return answer;
1664 }
1665
1666 /*
1667 * check whether a fuse context may access a cgroup dir or file
1668 *
1669 * If file is not null, it is a cgroup file to check under cg.
1670 * If file is null, then we are checking perms on cg itself.
1671 *
1672 * For files we can check the mode of the list_keys result.
1673 * For cgroups, we must make assumptions based on the files under the
1674 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1675 * yet.
1676 */
1677 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1678 {
1679 struct cgfs_files *k = NULL;
1680 bool ret = false;
1681
1682 k = cgfs_get_key(contrl, cg, file);
1683 if (!k)
1684 return false;
1685
1686 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1687 if (perms_include(k->mode >> 6, mode)) {
1688 ret = true;
1689 goto out;
1690 }
1691 }
1692 if (fc->gid == k->gid) {
1693 if (perms_include(k->mode >> 3, mode)) {
1694 ret = true;
1695 goto out;
1696 }
1697 }
1698 ret = perms_include(k->mode, mode);
1699
1700 out:
1701 free_key(k);
1702 return ret;
1703 }
1704
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from cgroup path @cg in place; when
 * the whole path is "/init.scope" the result is "/".  Paths without
 * the suffix are left untouched.
 */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + cg_len - suffix_len;
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0'; /* whole path was "/init.scope": keep "/" */
	else
		*tail = '\0';
}
1722
1723 /*
1724 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1725 * If pid is in /a, he may act on /a/b, but not on /b.
1726 * if the answer is false and nextcg is not NULL, then *nextcg will point
1727 * to a string containing the next cgroup directory under cg, which must be
1728 * freed by the caller.
1729 */
1730 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1731 {
1732 bool answer = false;
1733 char *c2 = get_pid_cgroup(pid, contrl);
1734 char *linecmp;
1735
1736 if (!c2)
1737 return false;
1738 prune_init_slice(c2);
1739
1740 /*
1741 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1742 * they pass in a cgroup without leading '/'
1743 *
1744 * The original line here was:
1745 * linecmp = *cg == '/' ? c2 : c2+1;
1746 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1747 * Serge, do you know?
1748 */
1749 if (*cg == '/' || !strncmp(cg, "./", 2))
1750 linecmp = c2;
1751 else
1752 linecmp = c2 + 1;
1753 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1754 if (nextcg) {
1755 *nextcg = get_next_cgroup_dir(linecmp, cg);
1756 }
1757 goto out;
1758 }
1759 answer = true;
1760
1761 out:
1762 free(c2);
1763 return answer;
1764 }
1765
1766 /*
1767 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1768 */
1769 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1770 {
1771 bool answer = false;
1772 char *c2, *task_cg;
1773 size_t target_len, task_len;
1774
1775 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1776 return true;
1777
1778 c2 = get_pid_cgroup(pid, contrl);
1779 if (!c2)
1780 return false;
1781 prune_init_slice(c2);
1782
1783 task_cg = c2 + 1;
1784 target_len = strlen(cg);
1785 task_len = strlen(task_cg);
1786 if (task_len == 0) {
1787 /* Task is in the root cg, it can see everything. This case is
1788 * not handled by the strmcps below, since they test for the
1789 * last /, but that is the first / that we've chopped off
1790 * above.
1791 */
1792 answer = true;
1793 goto out;
1794 }
1795 if (strcmp(cg, task_cg) == 0) {
1796 answer = true;
1797 goto out;
1798 }
1799 if (target_len < task_len) {
1800 /* looking up a parent dir */
1801 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1802 answer = true;
1803 goto out;
1804 }
1805 if (target_len > task_len) {
1806 /* looking up a child dir */
1807 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1808 answer = true;
1809 goto out;
1810 }
1811
1812 out:
1813 free(c2);
1814 return answer;
1815 }
1816
1817 /*
1818 * given /cgroup/freezer/a/b, return "freezer".
1819 * the returned char* should NOT be freed.
1820 */
1821 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1822 {
1823 const char *p1;
1824 char *contr, *slash;
1825
1826 if (strlen(path) < 9) {
1827 errno = EACCES;
1828 return NULL;
1829 }
1830 if (*(path + 7) != '/') {
1831 errno = EINVAL;
1832 return NULL;
1833 }
1834 p1 = path + 8;
1835 contr = strdupa(p1);
1836 if (!contr) {
1837 errno = ENOMEM;
1838 return NULL;
1839 }
1840 slash = strstr(contr, "/");
1841 if (slash)
1842 *slash = '\0';
1843
1844 int i;
1845 for (i = 0; i < num_hierarchies; i++) {
1846 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1847 return hierarchies[i];
1848 }
1849 errno = ENOENT;
1850 return NULL;
1851 }
1852
1853 /*
1854 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1855 * Note that the returned value may include files (keynames) etc
1856 */
1857 static const char *find_cgroup_in_path(const char *path)
1858 {
1859 const char *p1;
1860
1861 if (strlen(path) < 9) {
1862 errno = EACCES;
1863 return NULL;
1864 }
1865 p1 = strstr(path + 8, "/");
1866 if (!p1) {
1867 errno = EINVAL;
1868 return NULL;
1869 }
1870 errno = 0;
1871 return p1 + 1;
1872 }
1873
1874 /*
1875 * split the last path element from the path in @cg.
1876 * @dir is newly allocated and should be freed, @last not
1877 */
1878 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1879 {
1880 char *p;
1881
1882 do {
1883 *dir = strdup(cg);
1884 } while (!*dir);
1885 *last = strrchr(cg, '/');
1886 if (!*last) {
1887 *last = NULL;
1888 return;
1889 }
1890 p = strrchr(*dir, '/');
1891 *p = '\0';
1892 }
1893
1894 /*
1895 * FUSE ops for /cgroup
1896 */
1897
/*
 * cg_getattr: FUSE getattr handler for everything under /cgroup.
 * Synthesizes stat data: the mount root and bare controllers are
 * world-readable dirs; child cgroups take uid/gid from their keys;
 * keys are regular files with their cgroupfs mode.  Visibility is
 * restricted to the caller's own cgroup subtree.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* all entries report "now" as their timestamps */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	/* the /cgroup mountpoint itself */
	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* path1/path2 = parent dir + last component (dir or key name) */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* resolve the caller's pidns init so visibility checks use the
	 * container's view, not the individual task's pid */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* not a child cgroup: try it as a key (regular file) of path1 */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2007
2008 int cg_opendir(const char *path, struct fuse_file_info *fi)
2009 {
2010 struct fuse_context *fc = fuse_get_context();
2011 const char *cgroup;
2012 struct file_info *dir_info;
2013 char *controller = NULL;
2014
2015 if (!fc)
2016 return -EIO;
2017
2018 if (strcmp(path, "/cgroup") == 0) {
2019 cgroup = NULL;
2020 controller = NULL;
2021 } else {
2022 // return list of keys for the controller, and list of child cgroups
2023 controller = pick_controller_from_path(fc, path);
2024 if (!controller)
2025 return -errno;
2026
2027 cgroup = find_cgroup_in_path(path);
2028 if (!cgroup) {
2029 /* this is just /cgroup/controller, return its contents */
2030 cgroup = "/";
2031 }
2032 }
2033
2034 pid_t initpid = lookup_initpid_in_store(fc->pid);
2035 if (initpid <= 0)
2036 initpid = fc->pid;
2037 if (cgroup) {
2038 if (!caller_may_see_dir(initpid, controller, cgroup))
2039 return -ENOENT;
2040 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2041 return -EACCES;
2042 }
2043
2044 /* we'll free this at cg_releasedir */
2045 dir_info = malloc(sizeof(*dir_info));
2046 if (!dir_info)
2047 return -ENOMEM;
2048 dir_info->controller = must_copy_string(controller);
2049 dir_info->cgroup = must_copy_string(cgroup);
2050 dir_info->type = LXC_TYPE_CGDIR;
2051 dir_info->buf = NULL;
2052 dir_info->file = NULL;
2053 dir_info->buflen = 0;
2054
2055 fi->fh = (unsigned long)dir_info;
2056 return 0;
2057 }
2058
/*
 * cg_readdir: FUSE readdir handler for /cgroup directories.  Emits
 * "." and "..", then either the mounted controllers (for the /cgroup
 * root) or the keys and child cgroups of the opened directory,
 * filtered by what the caller's pidns init is allowed to see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	/* fi->fh must have been prepared by cg_opendir() */
	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* when the caller's cgroup lies below this dir, only the next
	 * path component on the way down is shown */
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* list the cgroup's keys (files) */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2142
2143 void do_release_file_info(struct fuse_file_info *fi)
2144 {
2145 struct file_info *f = (struct file_info *)fi->fh;
2146
2147 if (!f)
2148 return;
2149
2150 fi->fh = 0;
2151
2152 free(f->controller);
2153 f->controller = NULL;
2154 free(f->cgroup);
2155 f->cgroup = NULL;
2156 free(f->file);
2157 f->file = NULL;
2158 free(f->buf);
2159 f->buf = NULL;
2160 free(f);
2161 f = NULL;
2162 }
2163
/* FUSE releasedir handler: frees the file_info set up by cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2169
2170 int cg_open(const char *path, struct fuse_file_info *fi)
2171 {
2172 const char *cgroup;
2173 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2174 struct cgfs_files *k = NULL;
2175 struct file_info *file_info;
2176 struct fuse_context *fc = fuse_get_context();
2177 int ret;
2178
2179 if (!fc)
2180 return -EIO;
2181
2182 controller = pick_controller_from_path(fc, path);
2183 if (!controller)
2184 return -errno;
2185 cgroup = find_cgroup_in_path(path);
2186 if (!cgroup)
2187 return -errno;
2188
2189 get_cgdir_and_path(cgroup, &cgdir, &last);
2190 if (!last) {
2191 path1 = "/";
2192 path2 = cgdir;
2193 } else {
2194 path1 = cgdir;
2195 path2 = last;
2196 }
2197
2198 k = cgfs_get_key(controller, path1, path2);
2199 if (!k) {
2200 ret = -EINVAL;
2201 goto out;
2202 }
2203 free_key(k);
2204
2205 pid_t initpid = lookup_initpid_in_store(fc->pid);
2206 if (initpid <= 0)
2207 initpid = fc->pid;
2208 if (!caller_may_see_dir(initpid, controller, path1)) {
2209 ret = -ENOENT;
2210 goto out;
2211 }
2212 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2213 ret = -EACCES;
2214 goto out;
2215 }
2216
2217 /* we'll free this at cg_release */
2218 file_info = malloc(sizeof(*file_info));
2219 if (!file_info) {
2220 ret = -ENOMEM;
2221 goto out;
2222 }
2223 file_info->controller = must_copy_string(controller);
2224 file_info->cgroup = must_copy_string(path1);
2225 file_info->file = must_copy_string(path2);
2226 file_info->type = LXC_TYPE_CGFILE;
2227 file_info->buf = NULL;
2228 file_info->buflen = 0;
2229
2230 fi->fh = (unsigned long)file_info;
2231 ret = 0;
2232
2233 out:
2234 free(cgdir);
2235 return ret;
2236 }
2237
2238 int cg_access(const char *path, int mode)
2239 {
2240 int ret;
2241 const char *cgroup;
2242 char *path1, *path2, *controller;
2243 char *last = NULL, *cgdir = NULL;
2244 struct cgfs_files *k = NULL;
2245 struct fuse_context *fc = fuse_get_context();
2246
2247 if (strcmp(path, "/cgroup") == 0)
2248 return 0;
2249
2250 if (!fc)
2251 return -EIO;
2252
2253 controller = pick_controller_from_path(fc, path);
2254 if (!controller)
2255 return -errno;
2256 cgroup = find_cgroup_in_path(path);
2257 if (!cgroup) {
2258 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2259 if ((mode & W_OK) == 0)
2260 return 0;
2261 return -EACCES;
2262 }
2263
2264 get_cgdir_and_path(cgroup, &cgdir, &last);
2265 if (!last) {
2266 path1 = "/";
2267 path2 = cgdir;
2268 } else {
2269 path1 = cgdir;
2270 path2 = last;
2271 }
2272
2273 k = cgfs_get_key(controller, path1, path2);
2274 if (!k) {
2275 if ((mode & W_OK) == 0)
2276 ret = 0;
2277 else
2278 ret = -EACCES;
2279 goto out;
2280 }
2281 free_key(k);
2282
2283 pid_t initpid = lookup_initpid_in_store(fc->pid);
2284 if (initpid <= 0)
2285 initpid = fc->pid;
2286 if (!caller_may_see_dir(initpid, controller, path1)) {
2287 ret = -ENOENT;
2288 goto out;
2289 }
2290 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2291 ret = -EACCES;
2292 goto out;
2293 }
2294
2295 ret = 0;
2296
2297 out:
2298 free(cgdir);
2299 return ret;
2300 }
2301
/* FUSE release handler: frees the file_info set up by cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2307
2308 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2309
2310 static bool wait_for_sock(int sock, int timeout)
2311 {
2312 struct epoll_event ev;
2313 int epfd, ret, now, starttime, deltatime, saved_errno;
2314
2315 if ((starttime = time(NULL)) < 0)
2316 return false;
2317
2318 if ((epfd = epoll_create(1)) < 0) {
2319 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2320 return false;
2321 }
2322
2323 ev.events = POLLIN_SET;
2324 ev.data.fd = sock;
2325 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2326 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2327 close(epfd);
2328 return false;
2329 }
2330
2331 again:
2332 if ((now = time(NULL)) < 0) {
2333 close(epfd);
2334 return false;
2335 }
2336
2337 deltatime = (starttime + timeout) - now;
2338 if (deltatime < 0) { // timeout
2339 errno = 0;
2340 close(epfd);
2341 return false;
2342 }
2343 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2344 if (ret < 0 && errno == EINTR)
2345 goto again;
2346 saved_errno = errno;
2347 close(epfd);
2348
2349 if (ret <= 0) {
2350 errno = saved_errno;
2351 return false;
2352 }
2353 return true;
2354 }
2355
/* Receive up to @len bytes from @sockfd, waiting at most 2 seconds for
 * data; returns recv()'s result, or -1 on timeout/error. */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2362
2363 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2364 {
2365 struct msghdr msg = { 0 };
2366 struct iovec iov;
2367 struct cmsghdr *cmsg;
2368 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2369 char buf[1];
2370 buf[0] = 'p';
2371
2372 if (pingfirst) {
2373 if (msgrecv(sock, buf, 1) != 1) {
2374 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2375 return SEND_CREDS_FAIL;
2376 }
2377 }
2378
2379 msg.msg_control = cmsgbuf;
2380 msg.msg_controllen = sizeof(cmsgbuf);
2381
2382 cmsg = CMSG_FIRSTHDR(&msg);
2383 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2384 cmsg->cmsg_level = SOL_SOCKET;
2385 cmsg->cmsg_type = SCM_CREDENTIALS;
2386 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2387
2388 msg.msg_name = NULL;
2389 msg.msg_namelen = 0;
2390
2391 buf[0] = v;
2392 iov.iov_base = buf;
2393 iov.iov_len = sizeof(buf);
2394 msg.msg_iov = &iov;
2395 msg.msg_iovlen = 1;
2396
2397 if (sendmsg(sock, &msg, 0) < 0) {
2398 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2399 if (errno == 3)
2400 return SEND_CREDS_NOTSK;
2401 return SEND_CREDS_FAIL;
2402 }
2403
2404 return SEND_CREDS_OK;
2405 }
2406
/*
 * Receive one byte plus the sender's SCM_CREDENTIALS over @sock: the
 * counterpart of send_creds() with pingfirst=true.  We first enable
 * SO_PASSCRED and write a '1' ping byte so the peer knows we are
 * ready.  On success the payload byte lands in *v and the (kernel-
 * translated) creds in *cred; returns true.  On failure *v stays '1'
 * and the cred fields stay -1.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	/* SO_PASSCRED makes the kernel attach and pid-translate ucred data */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* only accept a control message that is exactly a ucred */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2464
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	// pipe the child uses to ACK the parent
	int sock;	// socket passed through to the wrapped function
	pid_t tpid;	// target pid whose namespace is involved
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2471
2472 /*
2473 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2474 * with clone(). This simply writes '1' as ACK back to the parent
2475 * before calling the actual wrapped function.
2476 */
2477 static int pid_ns_clone_wrapper(void *arg) {
2478 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2479 char b = '1';
2480
2481 close(args->cpipe[0]);
2482 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2483 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2484 close(args->cpipe[1]);
2485 return args->wrapped(args->sock, args->tpid);
2486 }
2487
2488 /*
2489 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2490 * int value back over the socket. This shifts the pid from the
2491 * sender's pidns into tpid's pidns.
2492 */
2493 static int pid_to_ns(int sock, pid_t tpid)
2494 {
2495 char v = '0';
2496 struct ucred cred;
2497
2498 while (recv_creds(sock, &cred, &v)) {
2499 if (v == '1')
2500 return 0;
2501 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2502 return 1;
2503 }
2504 return 0;
2505 }
2506
2507
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target process. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child acks that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* One page of stack for the small helper; clone() is handed the
	 * top of the downward-growing stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	/* This function runs in a forked child; never return to caller. */
	_exit(0);
}
2564
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * On success, *d holds the newline-separated pids translated into @tpid's
 * pid namespace (caller frees).  Returns false on any error.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Raw (untranslated) pid list as seen on the host. */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	/* Parent: feed each pid as SCM credentials, read back translation. */
	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next;
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child to exit by sending our own pid with marker '1'. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2652
/*
 * cg_read: FUSE read handler for cgroup files.  Looks the key up, checks
 * that the caller may read it, then returns the contents.  The "tasks"
 * and "cgroup.procs" files are special-cased so the pids are translated
 * into the reader's pid namespace.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	/* Whole contents are served at offset 0; later offsets mean EOF. */
	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Confirm the key exists; only the lookup result is needed. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Append a trailing newline when there is room and none present. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2719
2720 static int pid_from_ns(int sock, pid_t tpid)
2721 {
2722 pid_t vpid;
2723 struct ucred cred;
2724 char v;
2725 int ret;
2726
2727 cred.uid = 0;
2728 cred.gid = 0;
2729 while (1) {
2730 if (!wait_for_sock(sock, 2)) {
2731 lxcfs_error("%s\n", "Timeout reading from parent.");
2732 return 1;
2733 }
2734 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2735 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2736 return 1;
2737 }
2738 if (vpid == -1) // done
2739 break;
2740 v = '0';
2741 cred.pid = vpid;
2742 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2743 v = '1';
2744 cred.pid = getpid();
2745 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2746 return 1;
2747 }
2748 }
2749 return 0;
2750 }
2751
/*
 * pid_from_ns_wrapper: setns() into @tpid's pid namespace, then clone a
 * helper child (the first process truly inside the target ns) that runs
 * pid_from_ns() to translate pids read from @sock.  Runs in a forked
 * child and always terminates via _exit().
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target process. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child acks that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page of stack for the small helper; clone() is handed the
	 * top of the downward-growing stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2797
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Returns true and fills *answer on success, false otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char path[400];
	int ret;

	/* Bounded, checked path construction instead of sprintf(). */
	ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* convert_id_to_ns() signals "no mapping" with (uid_t)-1. */
	if (*answer == (uid_t)-1)
		return false;
	return true;
}
2819
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left as (uid_t)-1 / (gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	/* Bounded, checked path construction instead of sprintf(). */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* "Uid:" and "Gid:" lines carry real/effective/saved/fs ids; the
	 * first field (real id) is all we parse. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2858
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . they are owned by the same uid,
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's userns where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v)
		return true;
	if (r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* Is @r root inside its own userns, and is @v's uid visible there? */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2884
/*
 * do_write_pids: write pids contained in @buf into the cgroup's pids
 * file, translating each from @tpid's pid namespace first.  A forked
 * helper (pid_from_ns_wrapper) does the translation over a socketpair.
 * Each move is permission-checked via may_move_pid().  Returns true only
 * if every pid was written (and the final fclose succeeded).
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	/* Parent: send each pid, receive the translation as credentials. */
	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		/* v == '1' marks an untranslatable pid; silently skip it. */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	/* fclose flushes the buffered fprintf()s; a failure there means the
	 * kernel rejected the write, so report it. */
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2963
2964 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2965 struct fuse_file_info *fi)
2966 {
2967 struct fuse_context *fc = fuse_get_context();
2968 char *localbuf = NULL;
2969 struct cgfs_files *k = NULL;
2970 struct file_info *f = (struct file_info *)fi->fh;
2971 bool r;
2972
2973 if (f->type != LXC_TYPE_CGFILE) {
2974 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2975 return -EIO;
2976 }
2977
2978 if (offset)
2979 return 0;
2980
2981 if (!fc)
2982 return -EIO;
2983
2984 localbuf = alloca(size+1);
2985 localbuf[size] = '\0';
2986 memcpy(localbuf, buf, size);
2987
2988 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2989 size = -EINVAL;
2990 goto out;
2991 }
2992
2993 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2994 size = -EACCES;
2995 goto out;
2996 }
2997
2998 if (strcmp(f->file, "tasks") == 0 ||
2999 strcmp(f->file, "/tasks") == 0 ||
3000 strcmp(f->file, "/cgroup.procs") == 0 ||
3001 strcmp(f->file, "cgroup.procs") == 0)
3002 // special case - we have to translate the pids
3003 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3004 else
3005 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3006
3007 if (!r)
3008 size = -EINVAL;
3009
3010 out:
3011 free_key(k);
3012 return size;
3013 }
3014
/*
 * cg_chown: FUSE chown handler for cgroup files and directories.  The
 * caller must be privileged over the current owner of the entry (looked
 * up via the "tasks" file for directories) in its own user namespace.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Nobody may chown the top-level /cgroup directory. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3080
/*
 * cg_chmod: FUSE chmod handler for cgroup files and directories.
 * Mirrors cg_chown(), but root in the caller's user namespace is
 * sufficient (NS_ROOT_OPT rather than NS_ROOT_REQD).
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Nobody may chmod the top-level /cgroup directory. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3149
/*
 * cg_mkdir: FUSE mkdir handler; creates a new child cgroup owned by the
 * caller, provided the caller's init task sits in an ancestor cgroup and
 * the caller may write the parent directory.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Permission checks are relative to the container's init task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* If the new directory name matches the caller's own next
		 * cgroup level it already exists from its point of view. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3203
/*
 * cg_rmdir: FUSE rmdir handler; removes a child cgroup if the caller's
 * init task is in an ancestor cgroup and the caller may write the
 * parent directory.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Permission checks are relative to the container's init task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup (or above) reports EBUSY. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3264
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3271
/*
 * Parse a memory.stat buffer, extracting the "total_*" counters that are
 * surfaced through /proc/meminfo.  Each value is converted from bytes
 * to kB.  Unknown lines are skipped.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		size_t len;		/* strlen(key) */
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         11, cached },
		{ "total_active_anon",   17, active_anon },
		{ "total_inactive_anon", 19, inactive_anon },
		{ "total_active_file",   17, active_file },
		{ "total_inactive_file", 19, inactive_file },
		{ "total_unevictable",   17, unevictable },
		{ "total_shmem",         11, shmem },
	};
	size_t i;
	char *eol;

	while (*memstat) {
		/* First matching prefix wins, as in an if/else-if chain. */
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			sscanf(memstat + fields[i].len, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3308
/*
 * Scan a blkio stat buffer for the line "<major>:<minor> <iotype>" and
 * store its value in *v.  *v is left at 0 if no matching line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*str) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
3331
/*
 * read_file: pass-through reader used when the caller is not in a cgroup
 * we track.  Copies @path line by line into @d's cache buffer, records
 * the full length in d->size for later offset reads, then copies at most
 * @size bytes into @buf.  Returns the number of bytes placed in @buf,
 * or 0 on any error.
 */
int read_file(const char *path, char *buf, size_t size, struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		/* snprintf reports the would-be length; >= cache_size means
		 * the cache buffer is full and the copy was truncated. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3371
3372 /*
3373 * FUSE ops for /proc
3374 */
3375
/*
 * Read the given memory limit @file for @cgroup and return its value in
 * bytes, or (unsigned long)-1 when the file cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *str = NULL;

	if (cgfs_get_value("memory", cgroup, file, &str))
		limit = strtoul(str, NULL, 10);
	free(str);

	return limit;
}
3388
/*
 * Walk from @cgroup up to the root and return the smallest memory limit
 * found along the way, since any ancestor may impose a tighter limit.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *cur = strdupa(cgroup);
	unsigned long best, limit;

	best = get_memlimit(cur, file);
	while (strcmp(cur, "/") != 0) {
		cur = dirname(cur);
		limit = get_memlimit(cur, file);
		/* (unsigned long)-1 marks an unreadable/unset limit. */
		if (limit != -1 && limit < best)
			best = limit;
	}

	return best;
}
3405
/*
 * proc_meminfo_read: build a containerized /proc/meminfo.  Host lines
 * are rewritten using the caller's memory cgroup limits and usage
 * (memory.limit_in_bytes, memory.usage_in_bytes, memory.stat, and the
 * memsw files when swap accounting is enabled).  The rendered text is
 * cached in @d so follow-up reads at non-zero offsets can be served
 * from memory.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offsets are served from the cached rendering. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		/* Not in a tracked cgroup: pass the host file through. */
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* All values below are reported in kB. */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	/* Rewrite the host's meminfo line by line. */
	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more memory than the host has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts->swap_off == false) {
			unsigned long swaptotal = memswlimit,
				swapusage = memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	/* Cache the rendering so offset reads can be satisfied later. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3597
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed
 * by the caller, or NULL on failure.
 */
char *get_cpuset(const char *cg)
{
	char *cpus = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus))
		return NULL;

	return cpus;
}
3610
3611 bool cpu_in_cpuset(int cpu, const char *cpuset);
3612
/*
 * Return true if @line is a "processor : N" cpuinfo line whose cpu N is
 * part of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3621
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is
 * returned through `value`.  Returns false if the file cannot be read
 * or parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int ret;

	/* Bounded print; reject unexpected parameter names instead of
	 * overflowing the tightly-sized buffer (sprintf did not check). */
	ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (ret < 0 || (size_t)ret >= sizeof(file))
		goto err;

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* %ld does not match int64_t on 32-bit platforms; SCNd64 does. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op; no guard needed */
	return rv;
}
3647
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;
	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;
	if (quota <= 0 || period <= 0)
		return 0;

	/* Round up: a partial period's worth of quota buys one more CPU. */
	count = quota / period;
	if (quota % period > 0)
		count += 1;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3681
/*
 * Determine whether CPU views should be used or not: both the "cpu" and
 * "cpuacct" controllers must be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;
	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3700
3701 /*
3702 * check whether this is a '^processor" line in /proc/cpuinfo
3703 */
/*
 * check whether this is a '^processor' line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3712
/*
 * FUSE read handler for the emulated /proc/cpuinfo.  Only CPUs allowed by
 * the container's cpuset are rendered, renumbered from 0; when cpuview is
 * active the count is additionally capped at the CPU-quota limit.  The
 * rendered text is cached in d->buf for subsequent offset reads.
 * Returns the number of bytes copied into `buf`, 0, or -EINVAL.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder of the cached render; the
	 * file is only regenerated on a read from offset 0. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the requesting pid to its container's init; fall back to
	 * the raw host file when no cpuset cgroup can be determined. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	/* With cpuview, CPU quota limits how many processors we show. */
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x cpuinfo has a different layout ("processor N:" lines
		 * plus a header); detect it from the very first line. */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's processor-count header; it is rebuilt below
		 * for s390x with the container's count. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Quota cap reached: stop emitting further CPUs. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* Print this CPU's stanza only if it is in the cpuset,
			 * renumbering it to the next container-visible index. */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x variant of the processor line. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			/* Keep the per-CPU details, substituting our index. */
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-"processor" lines are copied through while inside the
		 * stanza of a CPU we decided to print. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: rebuild the buffer with the synthetic header (vendor line
	 * plus container-visible processor count) prepended. */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		/* Busy-retry until malloc succeeds; d->buf must end up valid. */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	/* Cache the full render, then serve at most `size` bytes from 0. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3888
3889 static uint64_t get_reaper_start_time(pid_t pid)
3890 {
3891 int ret;
3892 FILE *f;
3893 uint64_t starttime;
3894 /* strlen("/proc/") = 6
3895 * +
3896 * LXCFS_NUMSTRLEN64
3897 * +
3898 * strlen("/stat") = 5
3899 * +
3900 * \0 = 1
3901 * */
3902 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3903 char path[__PROC_PID_STAT_LEN];
3904 pid_t qpid;
3905
3906 qpid = lookup_initpid_in_store(pid);
3907 if (qpid <= 0) {
3908 /* Caller can check for EINVAL on 0. */
3909 errno = EINVAL;
3910 return 0;
3911 }
3912
3913 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3914 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3915 /* Caller can check for EINVAL on 0. */
3916 errno = EINVAL;
3917 return 0;
3918 }
3919
3920 f = fopen(path, "r");
3921 if (!f) {
3922 /* Caller can check for EINVAL on 0. */
3923 errno = EINVAL;
3924 return 0;
3925 }
3926
3927 /* Note that the *scanf() argument supression requires that length
3928 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3929 * at us. It's like telling someone you're not married and then asking
3930 * if you can bring your wife to the party.
3931 */
3932 ret = fscanf(f, "%*d " /* (1) pid %d */
3933 "%*s " /* (2) comm %s */
3934 "%*c " /* (3) state %c */
3935 "%*d " /* (4) ppid %d */
3936 "%*d " /* (5) pgrp %d */
3937 "%*d " /* (6) session %d */
3938 "%*d " /* (7) tty_nr %d */
3939 "%*d " /* (8) tpgid %d */
3940 "%*u " /* (9) flags %u */
3941 "%*u " /* (10) minflt %lu */
3942 "%*u " /* (11) cminflt %lu */
3943 "%*u " /* (12) majflt %lu */
3944 "%*u " /* (13) cmajflt %lu */
3945 "%*u " /* (14) utime %lu */
3946 "%*u " /* (15) stime %lu */
3947 "%*d " /* (16) cutime %ld */
3948 "%*d " /* (17) cstime %ld */
3949 "%*d " /* (18) priority %ld */
3950 "%*d " /* (19) nice %ld */
3951 "%*d " /* (20) num_threads %ld */
3952 "%*d " /* (21) itrealvalue %ld */
3953 "%" PRIu64, /* (22) starttime %llu */
3954 &starttime);
3955 if (ret != 1) {
3956 fclose(f);
3957 /* Caller can check for EINVAL on 0. */
3958 errno = EINVAL;
3959 return 0;
3960 }
3961
3962 fclose(f);
3963
3964 errno = 0;
3965 return starttime;
3966 }
3967
/* Reaper start time converted from clock ticks to whole seconds since
 * boot; 0 on failure (with the underlying errno semantics preserved). */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t ticks;
	int64_t hz;

	ticks = get_reaper_start_time(pid);
	if (ticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	hz = sysconf(_SC_CLK_TCK);
	if (hz < 0 && errno == EINVAL) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return ticks / hz;
}
3989
3990 static uint64_t get_reaper_age(pid_t pid)
3991 {
3992 uint64_t procstart, uptime, procage;
3993
3994 /* We need to substract the time the process has started since system
3995 * boot minus the time when the system has started to get the actual
3996 * reaper age.
3997 */
3998 procstart = get_reaper_start_time_in_sec(pid);
3999 procage = procstart;
4000 if (procstart > 0) {
4001 int ret;
4002 struct timespec spec;
4003
4004 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4005 if (ret < 0)
4006 return 0;
4007 /* We could make this more precise here by using the tv_nsec
4008 * field in the timespec struct and convert it to milliseconds
4009 * and then create a double for the seconds and milliseconds but
4010 * that seems more work than it is worth.
4011 */
4012 uptime = spec.tv_sec;
4013 procage = uptime - procstart;
4014 }
4015
4016 return procage;
4017 }
4018
4019 /*
4020 * Returns 0 on success.
4021 * It is the caller's responsibility to free `return_usage`, unless this
4022 * function returns an error.
4023 */
4024 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4025 {
4026 int cpucount = get_nprocs_conf();
4027 struct cpuacct_usage *cpu_usage;
4028 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4029 int cg_cpu;
4030 uint64_t cg_user, cg_system;
4031 int64_t ticks_per_sec;
4032 char *usage_str = NULL;
4033
4034 ticks_per_sec = sysconf(_SC_CLK_TCK);
4035
4036 if (ticks_per_sec < 0 && errno == EINVAL) {
4037 lxcfs_debug(
4038 "%s\n",
4039 "read_cpuacct_usage_all failed to determine number of clock ticks "
4040 "in a second");
4041 return -1;
4042 }
4043
4044 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4045 if (!cpu_usage)
4046 return -ENOMEM;
4047
4048 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4049 rv = -1;
4050 goto err;
4051 }
4052
4053 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4054 lxcfs_error("read_cpuacct_usage_all reading first line from "
4055 "%s/cpuacct.usage_all failed.\n", cg);
4056 rv = -1;
4057 goto err;
4058 }
4059
4060 read_pos += read_cnt;
4061
4062 for (i = 0, j = 0; i < cpucount; i++) {
4063 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4064 &cg_system, &read_cnt);
4065
4066 if (ret == EOF)
4067 break;
4068
4069 if (ret != 3) {
4070 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4071 "failed.\n", cg);
4072 rv = -1;
4073 goto err;
4074 }
4075
4076 read_pos += read_cnt;
4077
4078 /* Convert the time from nanoseconds to USER_HZ */
4079 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4080 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4081 j++;
4082 }
4083
4084 rv = 0;
4085 *return_usage = cpu_usage;
4086 *size = cpucount;
4087
4088 err:
4089 if (usage_str)
4090 free(usage_str);
4091
4092 if (rv != 0) {
4093 free(cpu_usage);
4094 *return_usage = NULL;
4095 }
4096
4097 return rv;
4098 }
4099
4100 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4101 {
4102 int i;
4103 unsigned long sum = 0;
4104
4105 for (i = 0; i < cpu_count; i++) {
4106 if (!newer[i].online)
4107 continue;
4108
4109 /* When cpuset is changed on the fly, the CPUs might get reordered.
4110 * We could either reset all counters, or check that the substractions
4111 * below will return expected results.
4112 */
4113 if (newer[i].user > older[i].user)
4114 diff[i].user = newer[i].user - older[i].user;
4115 else
4116 diff[i].user = 0;
4117
4118 if (newer[i].system > older[i].system)
4119 diff[i].system = newer[i].system - older[i].system;
4120 else
4121 diff[i].system = 0;
4122
4123 if (newer[i].idle > older[i].idle)
4124 diff[i].idle = newer[i].idle - older[i].idle;
4125 else
4126 diff[i].idle = 0;
4127
4128 sum += diff[i].user;
4129 sum += diff[i].system;
4130 sum += diff[i].idle;
4131 }
4132
4133 return sum;
4134 }
4135
4136 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4137 {
4138 unsigned long free_space, to_add;
4139
4140 free_space = threshold - usage->user - usage->system;
4141
4142 if (free_space > usage->idle)
4143 free_space = usage->idle;
4144
4145 to_add = free_space > *surplus ? *surplus : free_space;
4146
4147 *counter += to_add;
4148 usage->idle -= to_add;
4149 *surplus -= to_add;
4150 }
4151
4152 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4153 {
4154 struct cg_proc_stat *first = NULL, *prev, *tmp;
4155
4156 for (prev = NULL; node; ) {
4157 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4158 tmp = node;
4159 lxcfs_debug("Removing stat node for %s\n", node->cg);
4160
4161 if (prev)
4162 prev->next = node->next;
4163 else
4164 first = node->next;
4165
4166 node = node->next;
4167 free_proc_stat_node(tmp);
4168 } else {
4169 if (!first)
4170 first = node;
4171 prev = node;
4172 node = node->next;
4173 }
4174 }
4175
4176 return first;
4177 }
4178
4179 #define PROC_STAT_PRUNE_INTERVAL 10
4180 static void prune_proc_stat_history(void)
4181 {
4182 int i;
4183 time_t now = time(NULL);
4184
4185 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4186 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4187
4188 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4189 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4190 return;
4191 }
4192
4193 if (proc_stat_history[i]->next) {
4194 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4195 proc_stat_history[i]->lastcheck = now;
4196 }
4197
4198 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4199 }
4200 }
4201
4202 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4203 {
4204 struct cg_proc_stat *node;
4205
4206 pthread_rwlock_rdlock(&head->lock);
4207
4208 if (!head->next) {
4209 pthread_rwlock_unlock(&head->lock);
4210 return NULL;
4211 }
4212
4213 node = head->next;
4214
4215 do {
4216 if (strcmp(cg, node->cg) == 0)
4217 goto out;
4218 } while ((node = node->next));
4219
4220 node = NULL;
4221
4222 out:
4223 pthread_rwlock_unlock(&head->lock);
4224 prune_proc_stat_history();
4225 return node;
4226 }
4227
4228 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4229 {
4230 struct cg_proc_stat *node;
4231 int i;
4232
4233 node = malloc(sizeof(struct cg_proc_stat));
4234 if (!node)
4235 goto err;
4236
4237 node->cg = NULL;
4238 node->usage = NULL;
4239 node->view = NULL;
4240
4241 node->cg = malloc(strlen(cg) + 1);
4242 if (!node->cg)
4243 goto err;
4244
4245 strcpy(node->cg, cg);
4246
4247 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4248 if (!node->usage)
4249 goto err;
4250
4251 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4252
4253 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4254 if (!node->view)
4255 goto err;
4256
4257 node->cpu_count = cpu_count;
4258 node->next = NULL;
4259
4260 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4261 lxcfs_error("%s\n", "Failed to initialize node lock");
4262 goto err;
4263 }
4264
4265 for (i = 0; i < cpu_count; i++) {
4266 node->view[i].user = 0;
4267 node->view[i].system = 0;
4268 node->view[i].idle = 0;
4269 }
4270
4271 return node;
4272
4273 err:
4274 if (node && node->cg)
4275 free(node->cg);
4276 if (node && node->usage)
4277 free(node->usage);
4278 if (node && node->view)
4279 free(node->view);
4280 if (node)
4281 free(node);
4282
4283 return NULL;
4284 }
4285
4286 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4287 {
4288 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4289 struct cg_proc_stat_head *head = proc_stat_history[hash];
4290 struct cg_proc_stat *node, *rv = new_node;
4291
4292 pthread_rwlock_wrlock(&head->lock);
4293
4294 if (!head->next) {
4295 head->next = new_node;
4296 goto out;
4297 }
4298
4299 node = head->next;
4300
4301 for (;;) {
4302 if (strcmp(node->cg, new_node->cg) == 0) {
4303 /* The node is already present, return it */
4304 free_proc_stat_node(new_node);
4305 rv = node;
4306 goto out;
4307 }
4308
4309 if (node->next) {
4310 node = node->next;
4311 continue;
4312 }
4313
4314 node->next = new_node;
4315 goto out;
4316 }
4317
4318 out:
4319 pthread_rwlock_unlock(&head->lock);
4320 return rv;
4321 }
4322
4323 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4324 {
4325 struct cpuacct_usage *new_usage, *new_view;
4326 int i;
4327
4328 /* Allocate new memory */
4329 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4330 if (!new_usage)
4331 return false;
4332
4333 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4334 if (!new_view) {
4335 free(new_usage);
4336 return false;
4337 }
4338
4339 /* Copy existing data & initialize new elements */
4340 for (i = 0; i < cpu_count; i++) {
4341 if (i < node->cpu_count) {
4342 new_usage[i].user = node->usage[i].user;
4343 new_usage[i].system = node->usage[i].system;
4344 new_usage[i].idle = node->usage[i].idle;
4345
4346 new_view[i].user = node->view[i].user;
4347 new_view[i].system = node->view[i].system;
4348 new_view[i].idle = node->view[i].idle;
4349 } else {
4350 new_usage[i].user = 0;
4351 new_usage[i].system = 0;
4352 new_usage[i].idle = 0;
4353
4354 new_view[i].user = 0;
4355 new_view[i].system = 0;
4356 new_view[i].idle = 0;
4357 }
4358 }
4359
4360 free(node->usage);
4361 free(node->view);
4362
4363 node->usage = new_usage;
4364 node->view = new_view;
4365 node->cpu_count = cpu_count;
4366
4367 return true;
4368 }
4369
4370 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4371 {
4372 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4373 struct cg_proc_stat_head *head = proc_stat_history[hash];
4374 struct cg_proc_stat *node;
4375
4376 node = find_proc_stat_node(head, cg);
4377
4378 if (!node) {
4379 node = new_proc_stat_node(usage, cpu_count, cg);
4380 if (!node)
4381 return NULL;
4382
4383 node = add_proc_stat_node(node);
4384 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4385 }
4386
4387 pthread_mutex_lock(&node->lock);
4388
4389 /* If additional CPUs on the host have been enabled, CPU usage counter
4390 * arrays have to be expanded */
4391 if (node->cpu_count < cpu_count) {
4392 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4393 node->cpu_count, cpu_count, cg);
4394
4395 if (!expand_proc_stat_node(node, cpu_count)) {
4396 pthread_mutex_unlock(&node->lock);
4397 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4398 node->cpu_count, cpu_count, cg);
4399 return NULL;
4400 }
4401 }
4402
4403 return node;
4404 }
4405
4406 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4407 {
4408 int i;
4409
4410 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4411 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4412
4413 for (i = 0; i < cpu_count; i++) {
4414 node->view[i].user = 0;
4415 node->view[i].system = 0;
4416 node->view[i].idle = 0;
4417 }
4418
4419 node->cpu_count = cpu_count;
4420 }
4421
/*
 * Render a cpuview-based /proc/stat into `buf`: per-CPU busy time comes
 * from the container's cpuacct counters (`cg_cpu_usage`), idle time is
 * derived from the host's /proc/stat (`f`, positioned after its first
 * "cpu " line), and the number of rendered CPUs is capped by the
 * cgroup's CPU quota.  Returns bytes written, or 0 on error.
 */
static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
{
	char *line = NULL;
	/* NOTE(review): `l` is size_t, so the `l < 0` checks below can never
	 * fire; a negative snprintf() return would wrap to a huge value and
	 * only be caught by the `l >= buf_size` checks. */
	size_t linelen = 0, total_len = 0, rv = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	struct cpuacct_usage *diff = NULL;
	int nprocs = get_nprocs_conf();

	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N */
			break;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		/* Ignore CPUs beyond the cpuacct array we were given. */
		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu ++;
		cpu_cnt ++;

		/* CPUs outside the container's cpuset are marked offline. */
		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++) {
				cg_cpu_usage[i].online = false;
			}
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10)
			continue;

		/* Idle as seen by the container: host idle plus whatever
		 * host time was NOT consumed by this cgroup. */
		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
					"%lu in cpuacct.usage_all; unable to determine idle time\n",
					curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	/* Returned locked; unlocked in the err path below. */
	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);

	if (!stat_node) {
		lxcfs_error("unable to find/create stat node for %s\n", cg);
		rv = 0;
		goto err;
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		rv = 0;
		goto err;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 * Only the first online CPU is compared (note the unconditional
	 * break at the end of the loop body).
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	/* Accumulate deltas into the raw counters; time spent on CPUs past
	 * the quota limit becomes "surplus" to be redistributed below. */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		/* Spread the surplus over visible CPUs that still have room
		 * under the threshold: user time first, then system. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (i == max_cpus)
				break;

			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(
					&user_surplus,
					&diff[curcpu],
					&diff[curcpu].user,
					threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(
					&system_surplus,
					&diff[curcpu],
					&diff[curcpu].system,
					threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		/* Fold the (surplus-adjusted) deltas into the view and sum
		 * over the visible CPUs. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (i == max_cpus)
				break;

			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}

	} else {
		/* No quota: the view simply mirrors the raw counters. */
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
			user_sum,
			system_sum,
			idle_sum);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				i,
				stat_node->view[curcpu].user,
				stat_node->view[curcpu].system,
				stat_node->view[curcpu].idle);

		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	rv = total_len;

err:
	/* stat_node->lock was taken by find_or_create_proc_stat_node(). */
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);
	if (line)
		free(line);
	if (diff)
		free(diff);
	return rv;
}
4737
4738 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * FUSE read handler for the emulated /proc/stat.  Per-CPU lines are
 * filtered by the container's cpuset and renumbered from 0; when
 * cpuacct.usage_all is readable, its counters replace the host's values.
 * The first CPUALL_MAX_SIZE bytes of d->buf are reserved for the
 * synthesized aggregate "cpu" line, which is filled in last.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	/* Non-zero offset: serve from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	/* Resolve the caller to its container's init pid; without a cpuset
	 * cgroup, pass the host file through unmodified. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* With both cpu and cpuacct controllers available, delegate to the
	 * cpuview renderer, which also honors CPU quotas. */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Skip CPUs the container's cpuset does not allow. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* Without cpuacct data (or on a parse failure), pass the
		 * host's line through renumbered to the container index. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			/* Idle as seen by the container: host idle plus host
			 * time not consumed by this cgroup. */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Synthesize the aggregate "cpu" line into the reserved prefix. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-CPU text down so it directly follows the "cpu"
	 * summary line. */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4980
4981 /* This function retrieves the busy time of a group of tasks by looking at
4982 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4983 * been given it's own cpuacct cgroup. If not, this function will take the busy
4984 * time of all other taks that do not actually belong to the container into
4985 * account as well. If someone has a clever solution for this please send a
4986 * patch!
4987 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *val = NULL;
	unsigned long seconds = 0;
	pid_t reaper;

	/* Resolve the container's reaper (init); without one there is
	 * nothing to account against.
	 */
	reaper = lookup_initpid_in_store(task);
	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (!cg)
		goto cleanup;
	prune_init_slice(cg);

	if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &val)) {
		/* cpuacct.usage is reported in nanoseconds; convert to seconds. */
		seconds = strtoul(val, NULL, 10) / 1000000000;
	}

cleanup:
	free(cg);
	free(val);
	return seconds;
}
5011
#if RELOADTEST
/* Test hook: drop a marker file so the reload test can observe that
 * this library was (re)loaded. Failures are deliberately ignored.
 */
void iwashere(void)
{
	int fd = open("/tmp/lxcfs-iwashere", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd >= 0)
		close(fd);
}
#endif
5022
5023 /*
5024 * We read /proc/uptime and reuse its second field.
5025 * For the first field, we use the mtime for the reaper for
5026 * the calling pid as returned by getreaperage
5027 */
5028 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5029 struct fuse_file_info *fi)
5030 {
5031 struct fuse_context *fc = fuse_get_context();
5032 struct file_info *d = (struct file_info *)fi->fh;
5033 unsigned long int busytime = get_reaper_busy(fc->pid);
5034 char *cache = d->buf;
5035 ssize_t total_len = 0;
5036 uint64_t idletime, reaperage;
5037
5038 #if RELOADTEST
5039 iwashere();
5040 #endif
5041
5042 if (offset){
5043 if (!d->cached)
5044 return 0;
5045 if (offset > d->size)
5046 return -EINVAL;
5047 int left = d->size - offset;
5048 total_len = left > size ? size: left;
5049 memcpy(buf, cache + offset, total_len);
5050 return total_len;
5051 }
5052
5053 reaperage = get_reaper_age(fc->pid);
5054 /* To understand why this is done, please read the comment to the
5055 * get_reaper_busy() function.
5056 */
5057 idletime = reaperage;
5058 if (reaperage >= busytime)
5059 idletime = reaperage - busytime;
5060
5061 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5062 if (total_len < 0 || total_len >= d->buflen){
5063 lxcfs_error("%s\n", "failed to write to cache");
5064 return 0;
5065 }
5066
5067 d->size = (int)total_len;
5068 d->cached = 1;
5069
5070 if (total_len > size) total_len = size;
5071
5072 memcpy(buf, d->buf, total_len);
5073 return total_len;
5074 }
5075
/* Read handler for /proc/diskstats: synthesize per-device I/O statistics
 * from the caller's blkio cgroup (the *_recursive counters). Devices with
 * all-zero counters are omitted. Falls back to the host's /proc/diskstats
 * when the caller has no blkio cgroup. Returns bytes copied into @buf,
 * or 0 on error.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
		*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continuation read: serve from the cached buffer. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Fetch all recursive blkio counters for this cgroup up front. */
	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	/* Walk the host's device list, substituting cgroup-local values. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* Bytes -> 512-byte sectors, matching /proc/diskstats units. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Nanoseconds -> milliseconds for the tick fields. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* ios_pgr and rq_ticks are not tracked per cgroup and stay 0. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5208
5209 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5210 struct fuse_file_info *fi)
5211 {
5212 struct fuse_context *fc = fuse_get_context();
5213 struct file_info *d = (struct file_info *)fi->fh;
5214 char *cg = NULL;
5215 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5216 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5217 ssize_t total_len = 0, rv = 0;
5218 ssize_t l = 0;
5219 char *cache = d->buf;
5220
5221 if (offset) {
5222 if (offset > d->size)
5223 return -EINVAL;
5224 if (!d->cached)
5225 return 0;
5226 int left = d->size - offset;
5227 total_len = left > size ? size: left;
5228 memcpy(buf, cache + offset, total_len);
5229 return total_len;
5230 }
5231
5232 pid_t initpid = lookup_initpid_in_store(fc->pid);
5233 if (initpid <= 0)
5234 initpid = fc->pid;
5235 cg = get_pid_cgroup(initpid, "memory");
5236 if (!cg)
5237 return read_file("/proc/swaps", buf, size, d);
5238 prune_init_slice(cg);
5239
5240 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5241
5242 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5243 goto err;
5244
5245 memusage = strtoul(memusage_str, NULL, 10);
5246
5247 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5248 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5249
5250 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5251 memswusage = strtoul(memswusage_str, NULL, 10);
5252
5253 swap_total = (memswlimit - memlimit) / 1024;
5254 swap_free = (memswusage - memusage) / 1024;
5255 }
5256
5257 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5258
5259 /* When no mem + swap limit is specified or swapaccount=0*/
5260 if (!memswlimit) {
5261 char *line = NULL;
5262 size_t linelen = 0;
5263 FILE *f = fopen("/proc/meminfo", "r");
5264
5265 if (!f)
5266 goto err;
5267
5268 while (getline(&line, &linelen, f) != -1) {
5269 if (startswith(line, "SwapTotal:")) {
5270 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5271 } else if (startswith(line, "SwapFree:")) {
5272 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5273 }
5274 }
5275
5276 free(line);
5277 fclose(f);
5278 }
5279
5280 if (swap_total > 0) {
5281 l = snprintf(d->buf + total_len, d->size - total_len,
5282 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5283 swap_total, swap_free);
5284 total_len += l;
5285 }
5286
5287 if (total_len < 0 || l < 0) {
5288 perror("Error writing to cache");
5289 rv = 0;
5290 goto err;
5291 }
5292
5293 d->cached = 1;
5294 d->size = (int)total_len;
5295
5296 if (total_len > size) total_len = size;
5297 memcpy(buf, d->buf, total_len);
5298 rv = total_len;
5299
5300 err:
5301 free(cg);
5302 free(memswlimit_str);
5303 free(memlimit_str);
5304 free(memusage_str);
5305 free(memswusage_str);
5306 return rv;
5307 }
5308 /*
5309 * Find the process pid from cgroup path.
5310 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5311 * @pid_buf : put pid to pid_buf.
5312 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5313 * @depth : the depth of cgroup in container.
5314 * @sum : return the number of pid.
5315 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5316 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0 */
	/* The do/while retry loop is this file's convention for "never fail
	 * on OOM": spin until malloc succeeds.
	 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() will close it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	/* Recurse into child cgroup directories first, up to @depth levels. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): both checks use strncmp(..., 1), so any entry
		 * starting with '.' is skipped by the first one and the ".."
		 * check is unreachable — harmless here since cgroup dirs do
		 * not start with '.'.
		 */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Now collect the pids listed in this cgroup's own cgroup.procs. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopen() takes ownership of fd; fclose() will close it. */
	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Append each pid line (still newline-terminated; the caller strips
	 * it) to *pid_buf, growing the array one slot at a time.
	 */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5394 /*
5395 * calc_load calculates the load according to the following formula:
5396 * load1 = load0 * exp + active * (1 - exp)
5397 *
5398 * @load1: the new loadavg.
5399 * @load0: the former loadavg.
5400 * @active: the total number of running pid at this moment.
5401 * @exp: the fixed-point defined in the beginning.
5402 */
5403 static unsigned long
5404 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5405 {
5406 unsigned long newload;
5407
5408 active = active > 0 ? active * FIXED_1 : 0;
5409 newload = load * exp + active * (FIXED_1 - exp);
5410 if (active >= load)
5411 newload += FIXED_1 - 1;
5412
5413 return newload / FIXED_1;
5414 }
5415
5416 /*
5417 * Return 0 means that container p->cg is closed.
5418 * Return -1 means that error occurred in refresh.
5419 * Positive num equals the total number of pid.
5420 */
5421 static int refresh_load(struct load_node *p, char *path)
5422 {
5423 FILE *f = NULL;
5424 char **idbuf;
5425 char proc_path[256];
5426 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5427 char *line = NULL;
5428 size_t linelen = 0;
5429 int sum, length;
5430 DIR *dp;
5431 struct dirent *file;
5432
5433 do {
5434 idbuf = malloc(sizeof(char *));
5435 } while (!idbuf);
5436 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5437 /* normal exit */
5438 if (sum == 0)
5439 goto out;
5440
5441 for (i = 0; i < sum; i++) {
5442 /*clean up '\n' */
5443 length = strlen(idbuf[i])-1;
5444 idbuf[i][length] = '\0';
5445 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5446 if (ret < 0 || ret > 255) {
5447 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5448 i = sum;
5449 sum = -1;
5450 goto err_out;
5451 }
5452
5453 dp = opendir(proc_path);
5454 if (!dp) {
5455 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5456 continue;
5457 }
5458 while ((file = readdir(dp)) != NULL) {
5459 if (strncmp(file->d_name, ".", 1) == 0)
5460 continue;
5461 if (strncmp(file->d_name, "..", 1) == 0)
5462 continue;
5463 total_pid++;
5464 /* We make the biggest pid become last_pid.*/
5465 ret = atof(file->d_name);
5466 last_pid = (ret > last_pid) ? ret : last_pid;
5467
5468 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5469 if (ret < 0 || ret > 255) {
5470 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5471 i = sum;
5472 sum = -1;
5473 closedir(dp);
5474 goto err_out;
5475 }
5476 f = fopen(proc_path, "r");
5477 if (f != NULL) {
5478 while (getline(&line, &linelen, f) != -1) {
5479 /* Find State */
5480 if ((line[0] == 'S') && (line[1] == 't'))
5481 break;
5482 }
5483 if ((line[7] == 'R') || (line[7] == 'D'))
5484 run_pid++;
5485 fclose(f);
5486 }
5487 }
5488 closedir(dp);
5489 }
5490 /*Calculate the loadavg.*/
5491 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5492 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5493 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5494 p->run_pid = run_pid;
5495 p->total_pid = total_pid;
5496 p->last_pid = last_pid;
5497
5498 free(line);
5499 err_out:
5500 for (; i > 0; i--)
5501 free(idbuf[i-1]);
5502 out:
5503 free(idbuf);
5504 return sum;
5505 }
5506 /*
5507 * Traverse the hash table and update it.
5508 */
/* Worker thread: every FLUSH_TIME seconds walk the whole load hash table
 * and refresh (or delete) every tracked cgroup node. Exits when
 * loadavg_stop is set by stop_load_daemon().
 */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* cgroup paths are opened relative to cfd, so
				 * a leading '/' needs a '.' prefix.
				 */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					/* Jumps into the else below: keep the
					 * node and just advance to the next.
					 */
					goto out;
				}
				sum = refresh_load(f, path);
				if (sum == 0) {
					/* Container is gone: unlink the node. */
					f = del_node(f, i);
				} else {
out:					f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		/* Sleep for whatever is left of the FLUSH_TIME window after
		 * accounting for the CPU time this pass consumed.
		 */
		time2 = clock();
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5566
/* Read handler for /proc/loadavg: report the per-container load average
 * maintained by the load_begin() worker, keyed by the caller's cpu
 * cgroup. Falls back to the host's /proc/loadavg when the feature is
 * disabled or no cgroup can be determined. Returns bytes copied, 0 on
 * error.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation read: serve from the cached buffer. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* NOTE: locate_node() returns with load_hash[hash].rdlock held; every
	 * path below must release it exactly once.
	 */
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 is the kernel's traditional rounding term for display. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5659 /* Return a positive number on success, return 0 on failure.*/
5660 pthread_t load_daemon(int load_use)
5661 {
5662 int ret;
5663 pthread_t pid;
5664
5665 ret = init_load();
5666 if (ret == -1) {
5667 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5668 return 0;
5669 }
5670 ret = pthread_create(&pid, NULL, load_begin, NULL);
5671 if (ret != 0) {
5672 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5673 load_free();
5674 return 0;
5675 }
5676 /* use loadavg, here loadavg = 1*/
5677 loadavg = load_use;
5678 return pid;
5679 }
5680
5681 /* Returns 0 on success. */
5682 int stop_load_daemon(pthread_t pid)
5683 {
5684 int s;
5685
5686 /* Signal the thread to gracefully stop */
5687 loadavg_stop = 1;
5688
5689 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5690 if (s != 0) {
5691 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5692 return -1;
5693 }
5694
5695 load_free();
5696 loadavg_stop = 0;
5697
5698 return 0;
5699 }
5700
/* Return the total number of bytes readable from @which, or 0 when the
 * file cannot be opened. Needed because procfs files report st_size 0.
 */
static off_t get_procfile_size(const char *which)
{
	off_t total = 0;
	int ch;
	FILE *f = fopen(which, "r");

	if (!f)
		return 0;

	/* Count bytes one at a time until EOF. */
	while ((ch = fgetc(f)) != EOF)
		total++;

	fclose(f);
	return total;
}
5717
5718 int proc_getattr(const char *path, struct stat *sb)
5719 {
5720 struct timespec now;
5721
5722 memset(sb, 0, sizeof(struct stat));
5723 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5724 return -EINVAL;
5725 sb->st_uid = sb->st_gid = 0;
5726 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5727 if (strcmp(path, "/proc") == 0) {
5728 sb->st_mode = S_IFDIR | 00555;
5729 sb->st_nlink = 2;
5730 return 0;
5731 }
5732 if (strcmp(path, "/proc/meminfo") == 0 ||
5733 strcmp(path, "/proc/cpuinfo") == 0 ||
5734 strcmp(path, "/proc/uptime") == 0 ||
5735 strcmp(path, "/proc/stat") == 0 ||
5736 strcmp(path, "/proc/diskstats") == 0 ||
5737 strcmp(path, "/proc/swaps") == 0 ||
5738 strcmp(path, "/proc/loadavg") == 0) {
5739 sb->st_size = 0;
5740 sb->st_mode = S_IFREG | 00444;
5741 sb->st_nlink = 1;
5742 return 0;
5743 }
5744
5745 return -ENOENT;
5746 }
5747
5748 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5749 struct fuse_file_info *fi)
5750 {
5751 if (filler(buf, ".", NULL, 0) != 0 ||
5752 filler(buf, "..", NULL, 0) != 0 ||
5753 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5754 filler(buf, "meminfo", NULL, 0) != 0 ||
5755 filler(buf, "stat", NULL, 0) != 0 ||
5756 filler(buf, "uptime", NULL, 0) != 0 ||
5757 filler(buf, "diskstats", NULL, 0) != 0 ||
5758 filler(buf, "swaps", NULL, 0) != 0 ||
5759 filler(buf, "loadavg", NULL, 0) != 0)
5760 return -EINVAL;
5761 return 0;
5762 }
5763
5764 int proc_open(const char *path, struct fuse_file_info *fi)
5765 {
5766 int type = -1;
5767 struct file_info *info;
5768
5769 if (strcmp(path, "/proc/meminfo") == 0)
5770 type = LXC_TYPE_PROC_MEMINFO;
5771 else if (strcmp(path, "/proc/cpuinfo") == 0)
5772 type = LXC_TYPE_PROC_CPUINFO;
5773 else if (strcmp(path, "/proc/uptime") == 0)
5774 type = LXC_TYPE_PROC_UPTIME;
5775 else if (strcmp(path, "/proc/stat") == 0)
5776 type = LXC_TYPE_PROC_STAT;
5777 else if (strcmp(path, "/proc/diskstats") == 0)
5778 type = LXC_TYPE_PROC_DISKSTATS;
5779 else if (strcmp(path, "/proc/swaps") == 0)
5780 type = LXC_TYPE_PROC_SWAPS;
5781 else if (strcmp(path, "/proc/loadavg") == 0)
5782 type = LXC_TYPE_PROC_LOADAVG;
5783 if (type == -1)
5784 return -ENOENT;
5785
5786 info = malloc(sizeof(*info));
5787 if (!info)
5788 return -ENOMEM;
5789
5790 memset(info, 0, sizeof(*info));
5791 info->type = type;
5792
5793 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5794 do {
5795 info->buf = malloc(info->buflen);
5796 } while (!info->buf);
5797 memset(info->buf, 0, info->buflen);
5798 /* set actual size to buffer size */
5799 info->size = info->buflen;
5800
5801 fi->fh = (unsigned long)info;
5802 return 0;
5803 }
5804
/* FUSE access for /proc: the directory follows the host's permissions;
 * everything below it is strictly read-only.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
5815
/* FUSE release for /proc files: free the per-open file_info state that
 * was allocated by proc_open().
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5821
5822 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5823 struct fuse_file_info *fi)
5824 {
5825 struct file_info *f = (struct file_info *) fi->fh;
5826
5827 switch (f->type) {
5828 case LXC_TYPE_PROC_MEMINFO:
5829 return proc_meminfo_read(buf, size, offset, fi);
5830 case LXC_TYPE_PROC_CPUINFO:
5831 return proc_cpuinfo_read(buf, size, offset, fi);
5832 case LXC_TYPE_PROC_UPTIME:
5833 return proc_uptime_read(buf, size, offset, fi);
5834 case LXC_TYPE_PROC_STAT:
5835 return proc_stat_read(buf, size, offset, fi);
5836 case LXC_TYPE_PROC_DISKSTATS:
5837 return proc_diskstats_read(buf, size, offset, fi);
5838 case LXC_TYPE_PROC_SWAPS:
5839 return proc_swaps_read(buf, size, offset, fi);
5840 case LXC_TYPE_PROC_LOADAVG:
5841 return proc_loadavg_read(buf, size, offset, fi);
5842 default:
5843 return -EINVAL;
5844 }
5845 }
5846
5847 /*
5848 * Functions needed to setup cgroups in the __constructor__.
5849 */
5850
/* Create @dir and any missing parent directories (like `mkdir -p`),
 * each with @mode. Already-existing components are fine (EEXIST is
 * ignored). Returns false on allocation failure or any other mkdir
 * error.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Skip consecutive '/', then advance past the next path
		 * component; [orig, dir) is the prefix to create this round.
		 */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		if (mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
5874
5875 static bool umount_if_mounted(void)
5876 {
5877 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5878 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5879 return false;
5880 }
5881 return true;
5882 }
5883
/* __typeof__ should be safe to use with all compilers. */
/* The exact integer type of statfs.f_type differs between architectures,
 * so derive it instead of hard-coding one.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
/* True when the statfs result @fs reports filesystem magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	return (fs->f_type == (fs_type_magic)magic_val);
}
5890
5891 /*
5892 * looking at fs/proc_namespace.c, it appears we can
5893 * actually expect the rootfs entry to very specifically contain
5894 * " - rootfs rootfs "
5895 * IIUC, so long as we've chrooted so that rootfs is not our root,
5896 * the rootfs entry should always be skipped in mountinfo contents.
5897 */
static bool is_on_ramfs(void)
{
	bool found = false;
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!found && getline(&line, &len, f) != -1) {
		char *p, *p2;
		int field;

		/* Advance to the 5th space-separated field: the mount point. */
		for (p = line, field = 0; p && field < 4; field++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;
		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		*p2 = '\0';
		if (strcmp(p + 1, "/") != 0)
			continue;

		/* This is the '/' entry; check whether its fs type is rootfs. */
		p = strchr(p2 + 1, '-');
		if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(f);
	return found;
}
5933
5934 static int pivot_enter()
5935 {
5936 int ret = -1, oldroot = -1, newroot = -1;
5937
5938 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5939 if (oldroot < 0) {
5940 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5941 return ret;
5942 }
5943
5944 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5945 if (newroot < 0) {
5946 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5947 goto err;
5948 }
5949
5950 /* change into new root fs */
5951 if (fchdir(newroot) < 0) {
5952 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5953 goto err;
5954 }
5955
5956 /* pivot_root into our new root fs */
5957 if (pivot_root(".", ".") < 0) {
5958 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5959 goto err;
5960 }
5961
5962 /*
5963 * At this point the old-root is mounted on top of our new-root.
5964 * To unmounted it we must not be chdir'd into it, so escape back
5965 * to the old-root.
5966 */
5967 if (fchdir(oldroot) < 0) {
5968 lxcfs_error("%s\n", "Failed to enter old root.");
5969 goto err;
5970 }
5971
5972 if (umount2(".", MNT_DETACH) < 0) {
5973 lxcfs_error("%s\n", "Failed to detach old root.");
5974 goto err;
5975 }
5976
5977 if (fchdir(newroot) < 0) {
5978 lxcfs_error("%s\n", "Failed to re-enter new root.");
5979 goto err;
5980 }
5981
5982 ret = 0;
5983
5984 err:
5985 if (oldroot > 0)
5986 close(oldroot);
5987 if (newroot > 0)
5988 close(newroot);
5989
5990 return ret;
5991 }
5992
5993 static int chroot_enter()
5994 {
5995 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5996 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5997 return -1;
5998 }
5999
6000 if (chroot(".") < 0) {
6001 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6002 return -1;
6003 }
6004
6005 if (chdir("/") < 0) {
6006 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6007 return -1;
6008 }
6009
6010 return 0;
6011 }
6012
6013 static int permute_and_enter(void)
6014 {
6015 struct statfs sb;
6016
6017 if (statfs("/", &sb) < 0) {
6018 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6019 return -1;
6020 }
6021
6022 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6023 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6024 * /proc/1/mountinfo. */
6025 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6026 return chroot_enter();
6027
6028 if (pivot_enter() < 0) {
6029 lxcfs_error("%s\n", "Could not perform pivot root.");
6030 return -1;
6031 }
6032
6033 return 0;
6034 }
6035
6036 /* Prepare our new clean root. */
6037 static int permute_prepare(void)
6038 {
6039 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6040 lxcfs_error("%s\n", "Failed to create directory for new root.");
6041 return -1;
6042 }
6043
6044 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6045 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6046 return -1;
6047 }
6048
6049 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6050 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6051 return -1;
6052 }
6053
6054 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6055 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6056 return -1;
6057 }
6058
6059 return 0;
6060 }
6061
/* Build the new root and switch into it.
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * Returns true on success, false on error.
 */
static bool permute_root(void)
{
	/* Stage the new root first; only enter it if staging succeeded. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6075
6076 static int preserve_mnt_ns(int pid)
6077 {
6078 int ret;
6079 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6080 char path[len];
6081
6082 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6083 if (ret < 0 || (size_t)ret >= len)
6084 return -1;
6085
6086 return open(path, O_RDONLY | O_CLOEXEC);
6087 }
6088
/* Prepare a private mount namespace holding a tmpfs at BASEDIR, under which
 * cgfs_mount_hierarchies() will mount the individual cgroup hierarchies.
 *
 * The order of operations here is load-bearing: the namespace fd must be
 * preserved only after unshare(), and / must be made MS_PRIVATE before the
 * tmpfs mount so none of this propagates back to the host namespace.
 * Returns true on success, false on error.
 */
static bool cgfs_prepare_mounts(void)
{
	/* Create the staging mountpoint for the private cgroup mounts. */
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Remove any stale mount left over from a previous instance. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Detach into a fresh mount namespace; everything below is private. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a handle on the new namespace so it outlives this thread. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Stop mount events from propagating back to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	/* Small private tmpfs to hold the per-controller mountpoints. */
	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6124
6125 static bool cgfs_mount_hierarchies(void)
6126 {
6127 char *target;
6128 size_t clen, len;
6129 int i, ret;
6130
6131 for (i = 0; i < num_hierarchies; i++) {
6132 char *controller = hierarchies[i];
6133
6134 clen = strlen(controller);
6135 len = strlen(BASEDIR) + clen + 2;
6136 target = malloc(len);
6137 if (!target)
6138 return false;
6139
6140 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6141 if (ret < 0 || ret >= len) {
6142 free(target);
6143 return false;
6144 }
6145 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6146 free(target);
6147 return false;
6148 }
6149 if (!strcmp(controller, "unified"))
6150 ret = mount("none", target, "cgroup2", 0, NULL);
6151 else
6152 ret = mount(controller, target, "cgroup", 0, controller);
6153 if (ret < 0) {
6154 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6155 free(target);
6156 return false;
6157 }
6158
6159 fd_hierarchies[i] = open(target, O_DIRECTORY);
6160 if (fd_hierarchies[i] < 0) {
6161 free(target);
6162 return false;
6163 }
6164 free(target);
6165 }
6166 return true;
6167 }
6168
/* Top-level driver for the private cgroup setup: prepare the namespace and
 * tmpfs, mount each hierarchy, then pivot/chroot into the clean root.
 * Returns true on success, false on error.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	/* Final step: enter the staged root. */
	return permute_root();
}
6184
/* Library constructor: runs when liblxcfs is loaded.
 *
 * Parses /proc/self/cgroup to discover the available cgroup hierarchies,
 * records them via store_hierarchy(), then sets up the private per-hierarchy
 * mounts (cgfs_setup_controllers()) inside a dedicated mount namespace and
 * switches back to the initial namespace afterwards. Finally initializes the
 * per-container CPU view. On any failure it logs and bails out, leaving the
 * library partially initialized (callers see empty hierarchies).
 */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line has the form "<idx>:<controllers>:<path>". */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		/* Split off the hierarchy index before the first ':'. */
		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		/* Drop the cgroup path after the last ':'; p now holds the
		 * controller list only. */
		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One cached directory fd per discovered hierarchy; filled in by
	 * cgfs_mount_hierarchies(). */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd so it can be restored after the pivot/chroot dance
	 * changes it to "/". */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Return to the original mount namespace; the private one stays alive
	 * via cgroup_mount_ns_fd. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6275
6276 static void __attribute__((destructor)) free_subsystems(void)
6277 {
6278 int i;
6279
6280 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6281
6282 for (i = 0; i < num_hierarchies; i++) {
6283 if (hierarchies[i])
6284 free(hierarchies[i]);
6285 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6286 close(fd_hierarchies[i]);
6287 }
6288 free(hierarchies);
6289 free(fd_hierarchies);
6290 free_cpuview();
6291
6292 if (cgroup_mount_ns_fd >= 0)
6293 close(cgroup_mount_ns_fd);
6294 }