]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
bindings: partially convert to cleanup macros
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h"
41 #include "memory_utils.h"
42
43 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
44 #define LXCFS_NUMSTRLEN64 21
45
46 /* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Raw-syscall fallback: older C libraries do not export pivot_root(2). */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* Architecture without the syscall number: report "not implemented". */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
60
/* Kinds of virtual entries lxcfs serves; stored in struct file_info->type. */
enum {
	LXC_TYPE_CGDIR,		/* cgroup directory */
	LXC_TYPE_CGFILE,	/* cgroup file */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
72
/* Per-open state for a virtual file; one of these hangs off each FUSE handle. */
struct file_info {
	char *controller;	/* cgroup controller name (cgroup entries) */
	char *cgroup;		/* cgroup path (cgroup entries) */
	char *file;		/* file name within the cgroup */
	int type;		/* one of the LXC_TYPE_* values above */
	char *buf; // unused as of yet
	int buflen;		/* allocated size of @buf */
	int size; //actual data size
	int cached;		/* nonzero once generated content is in @buf */
};
83
/* Per-CPU accounting values used to build the container's CPU view. */
struct cpuacct_usage {
	uint64_t user;		/* user-mode time */
	uint64_t system;	/* kernel-mode time */
	uint64_t idle;		/* idle time */
	bool online;		/* whether this CPU is visible to the container */
};
90
91 /* The function of hash table.*/
92 #define LOAD_SIZE 100 /*the size of hash_table */
93 #define FLUSH_TIME 5 /*the flush rate */
94 #define DEPTH_DIR 3 /*the depth of per cgroup */
95 /* The function of calculate loadavg .*/
96 #define FSHIFT 11 /* nr of bits of precision */
97 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
98 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
99 #define EXP_5 2014 /* 1/exp(5sec/5min) */
100 #define EXP_15 2037 /* 1/exp(5sec/15min) */
101 #define LOAD_INT(x) ((x) >> FSHIFT)
102 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
103 /*
104 * This parameter is used for proc_loadavg_read().
105 * 1 means use loadavg, 0 means not use.
106 */
107 static int loadavg = 0;
108 static volatile sig_atomic_t loadavg_stop = 0;
/*
 * ELF-style string hash of @name, masked to a non-negative int so it can
 * be used directly as a hash-table index after a modulo.
 */
static int calc_hash(const char *name)
{
	unsigned int h = 0;

	for (; *name; name++) {
		unsigned int high;

		h = (h << 4) + (unsigned int)*name;
		high = h & 0xf0000000;
		if (high != 0)
			h ^= high >> 24;
		h &= ~high;
	}

	return (int)(h & 0x7fffffff);
}
123
/* One tracked cgroup in the loadavg hash table (doubly linked per bucket). */
struct load_node {
	char *cg;  /*cg */
	unsigned long avenrun[3];		/* Load averages */
	unsigned int run_pid;	/* NOTE(review): presumably count of running tasks — confirm */
	unsigned int total_pid;	/* NOTE(review): presumably total task count — confirm */
	unsigned int last_pid;
	int cfd;		/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	/* Address of the pointer that points at this node (bucket head or
	 * previous node's @next); lets removal work without a prev pointer. */
	struct load_node **pre;
};
134
/* Head of one hash bucket of tracked cgroups, with its three locks. */
struct load_head {
	/*
	 * The lock serializes insertion and refresh of load_nodes. For the
	 * first load_node of each hash bucket, insert and refresh in this
	 * hash bucket are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock serializes reading loadavg against deleting load_nodes.
	 * For each hash bucket, read and delete are mutually exclusive, but
	 * parallel readers are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock serializes reading loadavg against inserting load_nodes.
	 * For the first load_node of each hash bucket, read and insert are
	 * mutually exclusive, but parallel readers are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
156
157 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
158 /*
159 * init_load initialize the hash table.
160 * Return 0 on success, return -1 on failure.
161 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
/* Labels intentionally fall through: each one tears down what bucket i
 * managed to initialize before the failure, then the loop below destroys
 * all fully-initialized earlier buckets. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
199
200 static void insert_node(struct load_node **n, int locate)
201 {
202 struct load_node *f;
203
204 pthread_mutex_lock(&load_hash[locate].lock);
205 pthread_rwlock_wrlock(&load_hash[locate].rilock);
206 f = load_hash[locate].next;
207 load_hash[locate].next = *n;
208
209 (*n)->pre = &(load_hash[locate].next);
210 if (f)
211 f->pre = &((*n)->next);
212 (*n)->next = f;
213 pthread_mutex_unlock(&load_hash[locate].lock);
214 pthread_rwlock_unlock(&load_hash[locate].rilock);
215 }
216 /*
217 * locate_node() finds special node. Not return NULL means success.
218 * It should be noted that rdlock isn't unlocked at the end of code
219 * because this function is used to read special node. Delete is not
220 * allowed before read has ended.
221 * unlock rdlock only in proc_loadavg_read().
222 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		/* Empty bucket: release rilock; rdlock stays held — the
		 * caller (proc_loadavg_read) is responsible for unlocking it,
		 * as described in the comment above. */
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the bucket; rdlock held throughout prevents concurrent delete. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
240 /* Delete the load_node n and return the next node of it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	/* Unlink via the back pointer; @pre is the address of whatever
	 * pointed at @n (bucket head or previous node's next). */
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	__free_move__(n->cg);
	__free_move__(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
258
/* Tear down the entire loadavg hash table: free every node and destroy all
 * per-bucket locks. Called on shutdown of the loadavg machinery. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		/* Take all three bucket locks so no reader/refresher races
		 * the teardown; each is unlocked before being destroyed. */
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		for (f = load_hash[i].next; f; ) {
			__free_move__(f->cg);
			p = f->next;
			__free_move__(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
291
292 /* Data for CPU view */
/* Per-cgroup CPU-view state, one node per tracked cgroup. */
struct cg_proc_stat {
	char *cg;			/* cgroup path this node describes */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count;			/* number of entries in @usage/@view */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

/* Head of one CPU-view hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;	/* last time this bucket was pruned */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
311
312 #define CPUVIEW_HASH_SIZE 100
313 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
314
315 static bool cpuview_init_head(struct cg_proc_stat_head **head)
316 {
317 *head = malloc(sizeof(struct cg_proc_stat_head));
318 if (!(*head)) {
319 lxcfs_error("%s\n", strerror(errno));
320 return false;
321 }
322
323 (*head)->lastcheck = time(NULL);
324 (*head)->next = NULL;
325
326 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
327 lxcfs_error("%s\n", "Failed to initialize list lock");
328 __free_move__(*head);
329 return false;
330 }
331
332 return true;
333 }
334
335 static bool init_cpuview()
336 {
337 int i;
338
339 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
340 proc_stat_history[i] = NULL;
341
342 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
343 if (!cpuview_init_head(&proc_stat_history[i]))
344 goto err;
345 }
346
347 return true;
348
349 err:
350 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
351 if (proc_stat_history[i]) {
352 __free_move__(proc_stat_history[i]);
353 }
354 }
355
356 return false;
357 }
358
/* Free one CPU-view node and destroy its per-node mutex. */
static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	__free_move__(node->cg);
	__free_move__(node->usage);
	__free_move__(node->view);
	__free_move__(node);
}
367
368 static void cpuview_free_head(struct cg_proc_stat_head *head)
369 {
370 struct cg_proc_stat *node, *tmp;
371
372 if (head->next) {
373 node = head->next;
374
375 for (;;) {
376 tmp = node;
377 node = node->next;
378 free_proc_stat_node(tmp);
379
380 if (!node)
381 break;
382 }
383 }
384
385 pthread_rwlock_destroy(&head->lock);
386 __free_move__(head);
387 }
388
389 static void free_cpuview()
390 {
391 int i;
392
393 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
394 if (proc_stat_history[i])
395 cpuview_free_head(proc_stat_history[i]);
396 }
397 }
398
399 /* Reserve buffer size to account for file size changes. */
400 #define BUF_RESERVE_SIZE 512
401
402 /*
403 * A table caching which pid is init for a pid namespace.
404 * When looking up which pid is init for $qpid, we first
405 * 1. Stat /proc/$qpid/ns/pid.
406 * 2. Check whether the ino_t is in our store.
407 * a. if not, fork a child in qpid's ns to send us
408 * ucred.pid = 1, and read the initpid. Cache
409 * initpid and creation time for /proc/initpid
410 * in a new store entry.
411 * b. if so, verify that /proc/initpid still matches
412 * what we have saved. If not, clear the store
413 * entry and go back to a. If so, return the
414 * cached initpid.
415 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck; // last time this entry was validated (for pruning)
};
423
424 /* lol - look at how they are allocated in the kernel */
425 #define PIDNS_HASH_SIZE 4096
426 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
427
428 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
429 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
430 static void lock_mutex(pthread_mutex_t *l)
431 {
432 int ret;
433
434 if ((ret = pthread_mutex_lock(l)) != 0) {
435 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
436 exit(1);
437 }
438 }
439
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* fd of the private cgroup mount namespace; -1 until set up. */
static int cgroup_mount_ns_fd = -1;
460
461 static void unlock_mutex(pthread_mutex_t *l)
462 {
463 int ret;
464
465 if ((ret = pthread_mutex_unlock(l)) != 0) {
466 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
467 exit(1);
468 }
469 }
470
/* Acquire the pidns-init store lock. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the pidns-init store lock. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
480
481 /* Must be called under store_lock */
482 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
483 {
484 struct stat initsb;
485 char fnam[100];
486
487 snprintf(fnam, 100, "/proc/%d", e->initpid);
488 if (stat(fnam, &initsb) < 0)
489 return false;
490
491 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
492 initsb.st_ctime, e->initpid);
493
494 if (e->ctime != initsb.st_ctime)
495 return false;
496 return true;
497 }
498
499 /* Must be called under store_lock */
500 static void remove_initpid(struct pidns_init_store *e)
501 {
502 struct pidns_init_store *tmp;
503 int h;
504
505 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
506
507 h = HASH(e->ino);
508 if (pidns_hash_table[h] == e) {
509 pidns_hash_table[h] = e->next;
510 __free_move__(e);
511 return;
512 }
513
514 tmp = pidns_hash_table[h];
515 while (tmp) {
516 if (tmp->next == e) {
517 tmp->next = e->next;
518 __free_move__(e);
519 return;
520 }
521 tmp = tmp->next;
522 }
523 }
524
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Drop cached init-pid entries not validated within 2*PURGE_SECS; runs at
 * most once every PURGE_SECS seconds. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		struct pidns_init_store *e, *prev;

		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			/* __do_free frees @delme automatically at the end of
			 * each loop iteration (when it leaves scope). */
			__do_free struct pidns_init_store *delme = NULL;

			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				/* Unlink before @e is advanced; the node is
				 * freed via @delme when the iteration ends. */
				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
569
570 /* Must be called under store_lock */
571 static void save_initpid(struct stat *sb, pid_t pid)
572 {
573 struct pidns_init_store *e;
574 char fpath[100];
575 struct stat procsb;
576 int h;
577
578 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
579
580 snprintf(fpath, 100, "/proc/%d", pid);
581 if (stat(fpath, &procsb) < 0)
582 return;
583 do {
584 e = malloc(sizeof(*e));
585 } while (!e);
586 e->ino = sb->st_ino;
587 e->initpid = pid;
588 e->ctime = procsb.st_ctime;
589 h = HASH(e->ino);
590 e->next = pidns_hash_table[h];
591 e->lastcheck = time(NULL);
592 pidns_hash_table[h] = e;
593 }
594
595 /*
596 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
597 * entry for the inode number and creation time. Verify that the init pid
598 * is still valid. If not, remove it. Return the entry if valid, NULL
599 * otherwise.
600 * Must be called under store_lock
601 */
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
	int h = HASH(sb->st_ino);
	struct pidns_init_store *e = pidns_hash_table[h];

	while (e) {
		if (e->ino == sb->st_ino) {
			/* Found the namespace; make sure the cached init pid
			 * has not been recycled before trusting it. */
			if (initpid_still_valid(e, sb)) {
				e->lastcheck = time(NULL);
				return e;
			}
			/* Stale: drop the entry so the caller re-discovers. */
			remove_initpid(e);
			return NULL;
		}
		e = e->next;
	}

	return NULL;
}
621
/*
 * Return 1 if @path, resolved relative to directory fd @fd, is a directory;
 * 0 otherwise (including on stat failure).
 *
 * Fix: @fd was mistakenly also passed as fstatat()'s *flags* argument, so
 * any nonzero fd made the call fail with EINVAL and directories were never
 * detected. Pass 0 (follow symlinks) as flags instead.
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
630
/* Duplicate @str, retrying until the allocation succeeds ("must" semantics).
 * Returns NULL only for a NULL input. Caller owns the returned string. */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (!str)
		return NULL;

	do {
		copy = strdup(str);
	} while (!copy);

	return copy;
}
642
/* Strip every trailing '\n' from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && s[len - 1] == '\n')
		s[--len] = '\0';
}
650
#define BATCH_SIZE 50
/* Grow *mem in BATCH_SIZE-byte batches so it can hold @newlen bytes.
 * Retries realloc until it succeeds; never shrinks. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);
	*mem = grown;
}
/* Append @line (of @linelen bytes) plus its terminating NUL to *contents,
 * growing the buffer via dorealloc() and updating *len (NUL excluded). */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	/* +1 so the NUL copied by memcpy below always fits. */
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen+1);
	*len = newlen;
}
672
673 static char *slurp_file(const char *from, int fd)
674 {
675 __do_free char *line = NULL;
676 __do_fclose FILE *f = NULL;
677 char *contents = NULL;
678 size_t len = 0, fulllen = 0;
679 ssize_t linelen;
680
681 f = fdopen(fd, "r");
682 if (!f)
683 return NULL;
684
685 while ((linelen = getline(&line, &len, f)) != -1)
686 append_line(&contents, &fulllen, line, linelen);
687
688 if (contents)
689 drop_trailing_newlines(contents);
690
691 return contents;
692 }
693
694 static bool write_string(const char *fnam, const char *string, int fd)
695 {
696 FILE *f;
697 size_t len, ret;
698
699 f = fdopen(fd, "w");
700 if (!f)
701 return false;
702
703 len = strlen(string);
704 ret = fwrite(string, 1, len, f);
705 if (ret != len) {
706 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
707 strerror(errno), string, fnam);
708 fclose(f);
709 return false;
710 }
711
712 if (fclose(f) < 0) {
713 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
714 return false;
715 }
716
717 return true;
718 }
719
/* Ownership/mode metadata for one cgroup file, as returned by cgfs_get_key(). */
struct cgfs_files {
	char *name;		/* file (or cgroup leaf) name */
	uint32_t uid, gid;	/* owner */
	uint32_t mode;		/* st_mode bits */
};
725
726 #define ALLOC_NUM 20
727 static bool store_hierarchy(char *stridx, char *h)
728 {
729 if (num_hierarchies % ALLOC_NUM == 0) {
730 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
731 n *= ALLOC_NUM;
732 char **tmp = realloc(hierarchies, n * sizeof(char *));
733 if (!tmp) {
734 lxcfs_error("%s\n", strerror(errno));
735 exit(1);
736 }
737 hierarchies = tmp;
738 }
739
740 hierarchies[num_hierarchies++] = must_copy_string(h);
741 return true;
742 }
743
744 static void print_subsystems(void)
745 {
746 int i;
747
748 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
749 fprintf(stderr, "hierarchies:\n");
750 for (i = 0; i < num_hierarchies; i++) {
751 if (hierarchies[i])
752 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
753 fd_hierarchies[i], hierarchies[i]);
754 }
755 }
756
/* Return true if @needle appears as one complete element of the
 * comma-separated list @haystack. */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t nlen = strlen(needle);

	for (;;) {
		const char *comma;

		if (!*cur)
			break;
		comma = strchr(cur, ',');
		if (!comma)
			break;
		if ((size_t)(comma - cur) == nlen &&
		    strncmp(needle, cur, nlen) == 0)
			return true;
		cur = comma + 1;
	}

	/* Last (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
775
776 /* do we need to do any massaging here? I'm not sure... */
777 /* Return the mounted controller and store the corresponding open file descriptor
778 * referring to the controller mountpoint in the private lxcfs namespace in
779 * @cfd.
780 */
781 static char *find_mounted_controller(const char *controller, int *cfd)
782 {
783 int i;
784
785 for (i = 0; i < num_hierarchies; i++) {
786 if (!hierarchies[i])
787 continue;
788 if (strcmp(hierarchies[i], controller) == 0) {
789 *cfd = fd_hierarchies[i];
790 return hierarchies[i];
791 }
792 if (in_comma_list(controller, hierarchies[i])) {
793 *cfd = fd_hierarchies[i];
794 return hierarchies[i];
795 }
796 }
797
798 return NULL;
799 }
800
/* Write @value to @file of @cgroup under @controller.
 * Returns true on success. The opened fd is consumed by write_string(). */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
827
828 // Chown all the files in the cgroup directory. We do this when we create
829 // a cgroup on behalf of a user.
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* Fix: fdopendir() only takes ownership of @fd1 on success;
		 * close it here instead of leaking it. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	/* closedir() also closes the underlying fd1. */
	closedir(d);
}
867
/* Create cgroup @cg under @controller, chowning it (and its control files)
 * to @uid:@gid when they are not root. Returns 0 or a negative errno. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	/* Root-owned cgroups need no chown. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}
898
/* Recursively remove directory @dirname. @fd is an open fd on @dirname
 * itself (left open for the caller); @cfd is the controller mount fd that
 * all paths are resolved against. Returns true if everything was removed. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Recurse into subdirectories; regular files are removed
		 * implicitly by the kernel when the cgroup dir is unlinked. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	/* NOTE(review): closedir() already closed dupfd; this second close is
	 * redundant and targets an already-closed fd — confirm and drop. */
	close(dupfd);

	return ret;
}
957
/* Recursively remove cgroup @cg under @controller. Returns true on success. */
bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam, *tmpc;
	bool bret;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}
984
985 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
986 {
987 int cfd;
988 size_t len;
989 char *pathname, *tmpc;
990
991 tmpc = find_mounted_controller(controller, &cfd);
992 if (!tmpc)
993 return false;
994
995 /* Make sure we pass a relative path to *at() family of functions.
996 * . + /file + \0
997 */
998 len = strlen(file) + 2;
999 pathname = alloca(len);
1000 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1001 if (fchmodat(cfd, pathname, mode, 0) < 0)
1002 return false;
1003 return true;
1004 }
1005
/* chown the "tasks" and "cgroup.procs" files inside cgroup directory
 * @dirname (relative to @fd) to @uid:@gid. Returns 0 or a negative errno. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *path;

	/* "/cgroup.procs" is the longer of the two names; size for it. */
	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	path = alloca(len);

	snprintf(path, len, "%s/tasks", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	snprintf(path, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
1021
/* chown @file under @controller to @uid:@gid; if it is a directory, also
 * chown its "tasks" and "cgroup.procs" files (as cgmanager did).
 * Returns 0 or a negative errno. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}
1047
/*
 * Open @cgroup's "cgroup.procs" file under @controller for writing.
 * Returns a stream the caller must fclose(), or NULL on error.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		/* Fix: don't leak @fd when fdopen() fails. */
		close(fd);
	return f;
}
1071
1072 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1073 void ***list, size_t typesize,
1074 void* (*iterator)(const char*, const char*, const char*))
1075 {
1076 int cfd, fd, ret;
1077 size_t len;
1078 char *cg, *tmpc;
1079 char pathname[MAXPATHLEN];
1080 size_t sz = 0, asz = 0;
1081 struct dirent *dirent;
1082 DIR *dir;
1083
1084 tmpc = find_mounted_controller(controller, &cfd);
1085 *list = NULL;
1086 if (!tmpc)
1087 return false;
1088
1089 /* Make sure we pass a relative path to *at() family of functions. */
1090 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1091 cg = alloca(len);
1092 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1093 if (ret < 0 || (size_t)ret >= len) {
1094 lxcfs_error("Pathname too long under %s\n", cgroup);
1095 return false;
1096 }
1097
1098 fd = openat(cfd, cg, O_DIRECTORY);
1099 if (fd < 0)
1100 return false;
1101
1102 dir = fdopendir(fd);
1103 if (!dir)
1104 return false;
1105
1106 while ((dirent = readdir(dir))) {
1107 struct stat mystat;
1108
1109 if (!strcmp(dirent->d_name, ".") ||
1110 !strcmp(dirent->d_name, ".."))
1111 continue;
1112
1113 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1114 if (ret < 0 || ret >= MAXPATHLEN) {
1115 lxcfs_error("Pathname too long under %s\n", cg);
1116 continue;
1117 }
1118
1119 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1120 if (ret) {
1121 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1122 continue;
1123 }
1124 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1125 (directories && !S_ISDIR(mystat.st_mode)))
1126 continue;
1127
1128 if (sz+2 >= asz) {
1129 void **tmp;
1130 asz += BATCH_SIZE;
1131 do {
1132 tmp = realloc(*list, asz * typesize);
1133 } while (!tmp);
1134 *list = tmp;
1135 }
1136 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1137 (*list)[sz+1] = NULL;
1138 sz++;
1139 }
1140 if (closedir(dir) < 0) {
1141 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1142 return false;
1143 }
1144 return true;
1145 }
1146
/* Iterator for cgfs_list_children(): return a heap copy of the entry name.
 * @controller and @cgroup are unused. Retries strdup until it succeeds. */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	do {
		copy = strdup(dir_entry);
	} while (!copy);

	return copy;
}
1155
/* Fill *list with a NULL-terminated array of child-cgroup names of @cgroup
 * under @controller. Returns true on success; caller frees the entries. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1160
/* Free one cgfs_files entry; NULL is a no-op. */
void free_key(struct cgfs_files *k)
{
	if (!k)
		return;

	__free_move__(k->name);
	__free_move__(k);
}
1169
/* Free a NULL-terminated array of cgfs_files entries; NULL is a no-op. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);

	__free_move__(keys);
}
1182
/* Read the whole contents of @file in @cgroup under @controller into a
 * newly-allocated string in *value. Returns true on success; caller frees.
 * The opened fd is consumed by slurp_file(). */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
1209
/* Return true if cgroup file @file exists under @cgroup on @controller. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int cfd = -1, n;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Relative path for the *at() family: . + /cgroup + / + file + \0 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1231
1232 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1233 {
1234 int ret, cfd;
1235 size_t len;
1236 char *fnam, *tmpc;
1237 struct stat sb;
1238 struct cgfs_files *newkey;
1239
1240 tmpc = find_mounted_controller(controller, &cfd);
1241 if (!tmpc)
1242 return false;
1243
1244 if (file && *file == '/')
1245 file++;
1246
1247 if (file && strchr(file, '/'))
1248 return NULL;
1249
1250 /* Make sure we pass a relative path to *at() family of functions.
1251 * . + /cgroup + / + file + \0
1252 */
1253 len = strlen(cgroup) + 3;
1254 if (file)
1255 len += strlen(file) + 1;
1256 fnam = alloca(len);
1257 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1258 file ? "/" : "", file ? file : "");
1259
1260 ret = fstatat(cfd, fnam, &sb, 0);
1261 if (ret < 0)
1262 return NULL;
1263
1264 do {
1265 newkey = malloc(sizeof(struct cgfs_files));
1266 } while (!newkey);
1267 if (file)
1268 newkey->name = must_copy_string(file);
1269 else if (strrchr(cgroup, '/'))
1270 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1271 else
1272 newkey->name = must_copy_string(cgroup);
1273 newkey->uid = sb.st_uid;
1274 newkey->gid = sb.st_gid;
1275 newkey->mode = sb.st_mode;
1276
1277 return newkey;
1278 }
1279
/* Callback for cgfs_iterate_cgroup(): build a cgfs_files entry for one
 * key file; logs (but still returns NULL) on failure. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry;

	entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return entry;
}
1289
1290 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1291 {
1292 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1293 }
1294
/* Return true if @f names a child cgroup directory of @cgroup on
 * @controller (as opposed to a key file or nothing at all). */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd = -1, n;
	size_t pathlen;
	char *path;
	struct stat sb;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Relative path for the *at() family: . + /cgroup + / + f + \0 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1322
1323 #define SEND_CREDS_OK 0
1324 #define SEND_CREDS_NOTSK 1
1325 #define SEND_CREDS_FAIL 2
1326 static bool recv_creds(int sock, struct ucred *cred, char *v);
1327 static int wait_for_pid(pid_t pid);
1328 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1329 static int send_creds_clone_wrapper(void *arg);
1330
1331 /*
1332 * clone a task which switches to @task's namespace and writes '1'.
1333 * over a unix sock so we can read the task's reaper's pid in our
1334 * namespace
1335 *
1336 * Note: glibc's fork() does not respect pidns, which can lead to failed
1337 * assertions inside glibc (and thus failed forks) if the child's pid in
1338 * the pidns and the parent pid outside are identical. Using clone prevents
1339 * this issue.
1340 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* Scratch stack for the clone()d child below. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* Join @target's pid namespace; only children created after this
	 * point (via clone below) live inside it. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* NOTE(review): passing stack + stack_size assumes a downward-growing
	 * stack, which clone(2) expects on most Linux architectures. */
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* NOTE(review): wait_for_pid() returns 0 on success, so this
		 * condition looks inverted; the caller ignores our exit
		 * status, so it is currently harmless — verify intent. */
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1372
1373 static int send_creds_clone_wrapper(void *arg) {
1374 struct ucred cred;
1375 char v;
1376 int sock = *(int *)arg;
1377
1378 /* we are the child */
1379 cred.uid = 0;
1380 cred.gid = 0;
1381 cred.pid = 1;
1382 v = '1';
1383 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1384 return 1;
1385 return 0;
1386 }
1387
/*
 * Return the init (pid 1) of the pid namespace @task lives in, expressed
 * in our own pid namespace, or -1 on failure.  Relies on the kernel
 * translating SCM_CREDENTIALS pids between namespaces (see unix(7)).
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		/* Child: enter @task's pidns and send pid-1 creds back. */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* Parent: cred.pid arrives already translated into our pidns. */
	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1421
1422 static pid_t lookup_initpid_in_store(pid_t qpid)
1423 {
1424 pid_t answer = 0;
1425 struct stat sb;
1426 struct pidns_init_store *e;
1427 char fnam[100];
1428
1429 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1430 store_lock();
1431 if (stat(fnam, &sb) < 0)
1432 goto out;
1433 e = lookup_verify_initpid(&sb);
1434 if (e) {
1435 answer = e->initpid;
1436 goto out;
1437 }
1438 answer = get_init_pid_for_task(qpid);
1439 if (answer > 0)
1440 save_initpid(&sb, answer);
1441
1442 out:
1443 /* we prune at end in case we are returning
1444 * the value we were about to return */
1445 prune_initpid_store();
1446 store_unlock();
1447 return answer;
1448 }
1449
/* Reap @pid, retrying on EINTR.  Returns 0 if it exited cleanly with
 * status 0, and -1 on any error, signal death or non-zero exit. */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t waited;

	if (pid <= 0)
		return -1;

	for (;;) {
		waited = waitpid(pid, &status, 0);
		if (waited == pid)
			break;
		if (waited < 0 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1470
1471
1472 /*
1473 * append pid to *src.
1474 * src: a pointer to a char* in which ot append the pid.
1475 * sz: the number of characters printed so far, minus trailing \0.
1476 * asz: the allocated size so far
1477 * pid: the pid to append
1478 */
1479 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1480 {
1481 char tmp[30];
1482
1483 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1484
1485 if (!*src || tmplen + *sz + 1 >= *asz) {
1486 char *tmp;
1487 do {
1488 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1489 } while (!tmp);
1490 *src = tmp;
1491 *asz += BUF_RESERVE_SIZE;
1492 }
1493 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1494 *sz += tmplen;
1495 }
1496
1497 /*
1498 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1499 * valid in the caller's namespace, return the id mapped into
1500 * pid's namespace.
1501 * Returns the mapped id, or -1 on error.
1502 */
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 * Returns the mapped id, or (unsigned int)-1 on error / no mapping.
 *
 * Fixes: the overflow error message claimed "pid wrapparound" although
 * the values are uid/gid ranges (and was misspelled), and fgets() now
 * uses sizeof(line) instead of a repeated magic 400.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less than nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
1540
1541 /*
1542 * for is_privileged_over,
1543 * specify whether we require the calling uid to be root in his
1544 * namespace
1545 */
1546 #define NS_ROOT_REQD true
1547 #define NS_ROOT_OPT false
1548
1549 #define PROCLEN 100
1550
1551 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1552 {
1553 char fpath[PROCLEN];
1554 int ret;
1555 bool answer = false;
1556 uid_t nsuid;
1557
1558 if (victim == -1 || uid == -1)
1559 return false;
1560
1561 /*
1562 * If the request is one not requiring root in the namespace,
1563 * then having the same uid suffices. (i.e. uid 1000 has write
1564 * access to files owned by uid 1000
1565 */
1566 if (!req_ns_root && uid == victim)
1567 return true;
1568
1569 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1570 if (ret < 0 || ret >= PROCLEN)
1571 return false;
1572 FILE *f = fopen(fpath, "r");
1573 if (!f)
1574 return false;
1575
1576 /* if caller's not root in his namespace, reject */
1577 nsuid = convert_id_to_ns(f, uid);
1578 if (nsuid)
1579 goto out;
1580
1581 /*
1582 * If victim is not mapped into caller's ns, reject.
1583 * XXX I'm not sure this check is needed given that fuse
1584 * will be sending requests where the vfs has converted
1585 */
1586 nsuid = convert_id_to_ns(f, victim);
1587 if (nsuid == -1)
1588 goto out;
1589
1590 answer = true;
1591
1592 out:
1593 fclose(f);
1594 return answer;
1595 }
1596
/* Check whether the "other" permission bits in @fmode grant the kind of
 * access requested by the O_RDONLY/O_WRONLY/O_RDWR bits of @req_mode. */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		wanted = S_IROTH;
		break;
	case O_WRONLY:
		wanted = S_IWOTH;
		break;
	case O_RDWR:
		wanted = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}

	return (fmode & wanted) == wanted;
}
1616
1617
1618 /*
1619 * taskcg is a/b/c
1620 * querycg is /a/b/c/d/e
1621 * we return 'd'
1622 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	const char *tail;
	char *component, *slash;

	/* taskcg must extend past querycg for a "next" component to exist. */
	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		tail = taskcg + 1;
	else
		tail = taskcg + strlen(querycg) + 1;

	component = strdup(tail);
	if (!component)
		return NULL;

	/* Keep only the first path component of the remainder. */
	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';

	return component;
}
1643
/* Remove a single trailing newline from @x, in place. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1650
/*
 * Parse /proc/<pid>/cgroup and return the cgroup path @pid belongs to in
 * the hierarchy for controller @contrl, or NULL on error.  The returned
 * string is newly allocated; the caller must free it.
 */
static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int cfd;
	char fnam[PROCLEN];
	char *answer = NULL;
	size_t len = 0;
	int ret;
	/* Resolve @contrl to the name of the mounted hierarchy (cfd gets
	 * its dirfd, unused here). */
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	/* Each line reads "<id>:<controller list>:<cgroup path>". */
	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;

		if (!line[0])
			continue;

		c1 = strchr(line, ':');
		if (!c1)
			return NULL;

		c1++;

		c2 = strchr(c1, ':');
		if (!c2)
			return NULL;

		/* Terminate the controller-list field for the strcmp. */
		*c2 = '\0';

		if (strcmp(c1, h) != 0)
			continue;

		c2++;

		stripnewline(c2);

		/* Retry strdup() until it succeeds (file-wide OOM policy). */
		do {
			answer = strdup(c2);
		} while (!answer);

		break;
	}

	return answer;
}
1704
1705 /*
1706 * check whether a fuse context may access a cgroup dir or file
1707 *
1708 * If file is not null, it is a cgroup file to check under cg.
1709 * If file is null, then we are checking perms on cg itself.
1710 *
1711 * For files we can check the mode of the list_keys result.
1712 * For cgroups, we must make assumptions based on the files under the
1713 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1714 * yet.
1715 */
1716 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1717 {
1718 struct cgfs_files *k = NULL;
1719 bool ret = false;
1720
1721 k = cgfs_get_key(contrl, cg, file);
1722 if (!k)
1723 return false;
1724
1725 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1726 if (perms_include(k->mode >> 6, mode)) {
1727 ret = true;
1728 goto out;
1729 }
1730 }
1731 if (fc->gid == k->gid) {
1732 if (perms_include(k->mode >> 3, mode)) {
1733 ret = true;
1734 goto out;
1735 }
1736 }
1737 ret = perms_include(k->mode, mode);
1738
1739 out:
1740 free_key(k);
1741 return ret;
1742 }
1743
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" component from cgroup path @cg in
 * place; a bare "/init.scope" collapses to "/". */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg), suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';
	else
		suffix[0] = '\0';
}
1761
1762 /*
1763 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1764 * If pid is in /a, he may act on /a/b, but not on /b.
1765 * if the answer is false and nextcg is not NULL, then *nextcg will point
1766 * to a string containing the next cgroup directory under cg, which must be
1767 * freed by the caller.
1768 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	__do_free char *c2 = NULL;
	bool answer = false;
	char *linecmp;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	/* The caller's own cgroup must be a prefix of @cg, i.e. an
	 * ancestor of (or equal to) the queried cgroup. */
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		/* Not an ancestor: optionally report the next path
		 * component below @cg toward the caller's cgroup. */
		if (nextcg)
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		goto out;
	}
	answer = true;

out:
	return answer;
}
1803
1804 /*
1805 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1806 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	__do_free char *c2 = NULL;
	bool answer = false;
	char *task_cg;
	size_t target_len, task_len;

	/* Everyone may see the root cgroup. */
	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	task_cg = c2 + 1; /* drop the leading '/' */
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	/* Looking up the task's own cgroup. */
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	return answer;
}
1854
1855 /*
1856 * given /cgroup/freezer/a/b, return "freezer".
1857 * the returned char* should NOT be freed.
1858 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	/* Shortest valid path is "/cgroup/<c>": 9 characters. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	/* path[7] must be the '/' ending the "/cgroup" prefix. */
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8; /* first character of the controller name */
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	/* Cut off anything past the controller component. */
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	/* Return the canonical name from the hierarchies table so the
	 * caller need not free it. */
	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}
1890
1891 /*
1892 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1893 * Note that the returned value may include files (keynames) etc
1894 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *after_controller;

	/* Anything shorter than "/cgroup/x" cannot contain a cgroup. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	/* Skip "/cgroup/" and find the slash ending the controller name. */
	after_controller = strstr(path + 8, "/");
	if (!after_controller) {
		errno = EINVAL;
		return NULL;
	}
	errno = 0;
	return after_controller + 1;
}
1911
1912 /*
1913 * split the last path element from the path in @cg.
1914 * @dir is newly allocated and should be freed, @last not
1915 */
1916 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1917 {
1918 char *p;
1919
1920 do {
1921 *dir = strdup(cg);
1922 } while (!*dir);
1923 *last = strrchr(cg, '/');
1924 if (!*last) {
1925 *last = NULL;
1926 return;
1927 }
1928 p = strrchr(*dir, '/');
1929 *p = '\0';
1930 }
1931
1932 /*
1933 * FUSE ops for /cgroup
1934 */
1935
/* FUSE getattr for paths below /cgroup: synthesize a struct stat for the
 * virtual root, controller directories, child cgroup directories and key
 * files, honoring the caller's pid-namespace view and permissions. */
int cg_getattr(const char *path, struct stat *sb)
{
	__do_free char * cgdir = NULL;
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Timestamps are synthetic: everything looks touched "now". */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* Split into parent dir (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Do all visibility checks from the caller's pidns-init view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a directory: try it as a key file under the parent. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	return ret;
}
2044
/* FUSE opendir for /cgroup paths: validate visibility and permissions,
 * then stash a file_info on fi->fh for cg_readdir()/cg_releasedir(). */
int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		/* Top level: no specific controller or cgroup yet. */
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	/* Check access from the caller's pidns-init point of view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;
	/* NOTE(review): dir_info->size and ->cached are left uninitialized
	 * here; they appear unused for LXC_TYPE_CGDIR entries — confirm. */

	fi->fh = (unsigned long)dir_info;
	return 0;
}
2095
/* FUSE readdir for /cgroup paths: list the controllers at the top level,
 * otherwise emit the key files and child cgroups of the directory opened
 * in cg_opendir(), subject to the caller's ancestry checks. */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *nextcg = NULL;
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* A caller outside this subtree only gets to see the single next
	 * path component leading toward its own cgroup (if any). */
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Emit the key files of this cgroup. */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			__free_move__(clist[i]);
		__free_move__(clist);
	}
	return ret;
}
2178
2179 static void do_release_file_info(struct fuse_file_info *fi)
2180 {
2181 struct file_info *f = (struct file_info *)fi->fh;
2182
2183 if (!f)
2184 return;
2185
2186 fi->fh = 0;
2187
2188 __free_move__(f->controller);
2189 __free_move__(f->cgroup);
2190 __free_move__(f->file);
2191 __free_move__(f->buf);
2192 __free_move__(f);
2193 }
2194
/* FUSE releasedir: drop the directory state created in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path; /* unused: the handle carries all state */

	do_release_file_info(fi);

	return 0;
}
2200
/* FUSE open for cgroup key files: verify the key exists and that the
 * caller may access it, then attach a file_info to fi->fh for later
 * read/write/release handlers. */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	__do_free char *cgdir = NULL;
	const char *cgroup;
	char *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* Split into parent cgroup (path1) and key name (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* The key must exist; we only need the lookup, not the result. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	/* Check access from the caller's pidns-init point of view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;
	/* NOTE(review): file_info->size and ->cached are left
	 * uninitialized here — confirm the read path sets them first. */

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	return ret;
}
2268
/* FUSE access for /cgroup paths: mirror the visibility and permission
 * checks done in cg_open(), without creating any handle state. */
int cg_access(const char *path, int mode)
{
	__do_free char *cgdir = NULL;
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* Split into parent cgroup (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* Unknown entry: read/execute is tolerated, write is not. */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	/* Check access from the caller's pidns-init point of view. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	return ret;
}
2332
/* FUSE release: drop the file state created in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path; /* unused: the handle carries all state */

	do_release_file_info(fi);

	return 0;
}
2338
2339 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2340
/* Wait up to @timeout seconds for @sock to become readable (or reach
 * EOF/hangup, per POLLIN_SET).  Returns true when the socket is ready,
 * false on timeout (errno set to 0) or error. */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	/* Shrink the remaining budget by however long we already waited
	 * (EINTR restarts land here with the original deadline intact). */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* Preserve epoll_wait()'s errno across the close() below. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2386
/* Receive up to @len bytes from @sockfd, first waiting at most two
 * seconds for data to become available. */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2393
2394 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2395 {
2396 struct msghdr msg = { 0 };
2397 struct iovec iov;
2398 struct cmsghdr *cmsg;
2399 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2400 char buf[1];
2401 buf[0] = 'p';
2402
2403 if (pingfirst) {
2404 if (msgrecv(sock, buf, 1) != 1) {
2405 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2406 return SEND_CREDS_FAIL;
2407 }
2408 }
2409
2410 msg.msg_control = cmsgbuf;
2411 msg.msg_controllen = sizeof(cmsgbuf);
2412
2413 cmsg = CMSG_FIRSTHDR(&msg);
2414 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2415 cmsg->cmsg_level = SOL_SOCKET;
2416 cmsg->cmsg_type = SCM_CREDENTIALS;
2417 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2418
2419 msg.msg_name = NULL;
2420 msg.msg_namelen = 0;
2421
2422 buf[0] = v;
2423 iov.iov_base = buf;
2424 iov.iov_len = sizeof(buf);
2425 msg.msg_iov = &iov;
2426 msg.msg_iovlen = 1;
2427
2428 if (sendmsg(sock, &msg, 0) < 0) {
2429 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2430 if (errno == 3)
2431 return SEND_CREDS_NOTSK;
2432 return SEND_CREDS_FAIL;
2433 }
2434
2435 return SEND_CREDS_OK;
2436 }
2437
/*
 * Receive an SCM_CREDENTIALS control message plus one payload byte from
 * @sock.  On success *@cred holds the sender's credentials (pids
 * translated into our namespace by the kernel, see unix(7)) and *@v the
 * payload byte.  Returns false on socket error or timeout.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	/* Ask the kernel to attach the sender's credentials. */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	buf[0] = '1';
	/* Ping the peer so it knows we are ready (pairs with the
	 * pingfirst handshake in send_creds()). */
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only copy the credentials out if the control message really is
	 * an SCM_CREDENTIALS payload of the expected size. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2495
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe; // pipe used to ACK readiness back to the parent
	int sock; // socket forwarded to the wrapped function
	pid_t tpid; // target task whose pid namespace is involved
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2502
2503 /*
2504 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2505 * with clone(). This simply writes '1' as ACK back to the parent
2506 * before calling the actual wrapped function.
2507 */
2508 static int pid_ns_clone_wrapper(void *arg) {
2509 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2510 char b = '1';
2511
2512 close(args->cpipe[0]);
2513 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2514 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2515 close(args->cpipe[1]);
2516 return args->wrapped(args->sock, args->tpid);
2517 }
2518
2519 /*
2520 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2521 * int value back over the socket. This shifts the pid from the
2522 * sender's pidns into tpid's pidns.
2523 */
2524 static int pid_to_ns(int sock, pid_t tpid)
2525 {
2526 char v = '0';
2527 struct ucred cred;
2528
2529 while (recv_creds(sock, &cred, &v)) {
2530 if (v == '1')
2531 return 0;
2532 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2533 return 1;
2534 }
2535 return 0;
2536 }
2537
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs inside a fork()ed child of do_read_pids(): every failure path
 * simply _exit()s with a nonzero status.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Enter the pid namespace of the target task. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the clone()d grandchild ACKs that it started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	/* NOTE(review): passing stack + stack_size assumes a downward-growing
	 * stack, and a single page may be tight for the child — confirm this
	 * matches all supported architectures.
	 */
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2595
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * Reads @file from cgroup @cg of controller @contrl, translating each pid
 * into @tpid's pid namespace. The translated list is appended to *@d via
 * must_strcat_pid(). Returns true on success.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	__do_free char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Raw pid list as seen from our (host) pid namespace. */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send the pid as SCM_CREDENTIALS so the kernel translates
		 * it into the receiver's pid namespace (see pid_to_ns).
		 */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next;
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* v == '1' tells the helper child to exit (see pid_to_ns). */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2681
2682 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2683 struct fuse_file_info *fi)
2684 {
2685 __do_free char *data = NULL;
2686 struct fuse_context *fc = fuse_get_context();
2687 struct file_info *f = (struct file_info *)fi->fh;
2688 struct cgfs_files *k = NULL;
2689 int ret, s;
2690 bool r;
2691
2692 if (f->type != LXC_TYPE_CGFILE) {
2693 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2694 return -EIO;
2695 }
2696
2697 if (offset)
2698 return 0;
2699
2700 if (!fc)
2701 return -EIO;
2702
2703 if (!f->controller)
2704 return -EINVAL;
2705
2706 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2707 return -EINVAL;
2708 }
2709 free_key(k);
2710
2711
2712 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2713 ret = -EACCES;
2714 goto out;
2715 }
2716
2717 if (strcmp(f->file, "tasks") == 0 ||
2718 strcmp(f->file, "/tasks") == 0 ||
2719 strcmp(f->file, "/cgroup.procs") == 0 ||
2720 strcmp(f->file, "cgroup.procs") == 0)
2721 // special case - we have to translate the pids
2722 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2723 else
2724 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2725
2726 if (!r) {
2727 ret = -EINVAL;
2728 goto out;
2729 }
2730
2731 if (!data) {
2732 ret = 0;
2733 goto out;
2734 }
2735 s = strlen(data);
2736 if (s > size)
2737 s = size;
2738 memcpy(buf, data, s);
2739 if (s > 0 && s < size && data[s-1] != '\n')
2740 buf[s++] = '\n';
2741
2742 ret = s;
2743
2744 out:
2745 return ret;
2746 }
2747
2748 static int pid_from_ns(int sock, pid_t tpid)
2749 {
2750 pid_t vpid;
2751 struct ucred cred;
2752 char v;
2753 int ret;
2754
2755 cred.uid = 0;
2756 cred.gid = 0;
2757 while (1) {
2758 if (!wait_for_sock(sock, 2)) {
2759 lxcfs_error("%s\n", "Timeout reading from parent.");
2760 return 1;
2761 }
2762 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2763 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2764 return 1;
2765 }
2766 if (vpid == -1) // done
2767 break;
2768 v = '0';
2769 cred.pid = vpid;
2770 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2771 v = '1';
2772 cred.pid = getpid();
2773 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2774 return 1;
2775 }
2776 }
2777 return 0;
2778 }
2779
/*
 * pid_from_ns_wrapper: setns() into @tpid's pid namespace and clone a
 * child running pid_from_ns() there (the counterpart of
 * pid_to_ns_wrapper; see that function's comment for why clone() is used
 * instead of fork()). Runs in a fork()ed child of do_write_pids();
 * every failure path _exit()s nonzero.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Enter the pid namespace of the target task. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the clone()d grandchild ACKs that it started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	/* NOTE(review): stack + stack_size assumes a downward-growing stack
	 * — confirm for all supported architectures.
	 */
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2825
2826 /*
2827 * Given host @uid, return the uid to which it maps in
2828 * @pid's user namespace, or -1 if none.
2829 */
2830 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2831 {
2832 FILE *f;
2833 char line[400];
2834
2835 sprintf(line, "/proc/%d/uid_map", pid);
2836 if ((f = fopen(line, "r")) == NULL) {
2837 return false;
2838 }
2839
2840 *answer = convert_id_to_ns(f, uid);
2841 fclose(f);
2842
2843 if (*answer == -1)
2844 return false;
2845 return true;
2846 }
2847
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any error *uid and *gid are left at (or reset to) -1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;

	/* Bounded formatting instead of sprintf(). */
	snprintf(line, sizeof(line), "/proc/%d/status", pid);
	f = fopen(line, "r");
	if (!f) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	/* line is reused as the fgets() buffer once the file is open. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				goto out;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				goto out;
			}
			*gid = g;
		}
	}
out:
	/* Single cleanup path instead of per-branch fclose() calls. */
	fclose(f);
}
2886
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 * . they are the same task
 * . they are ownedy by the same uid
 * . @r is root on the host, or
 * . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	/* Same task, or requestor is host root. */
	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* Root in own userns and the victim's uid is mapped there. */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2912
/*
 * do_write_pids: write a pid list @buf (as supplied by the requestor in
 * pid namespace of @tpid) into @file of cgroup @cg under controller
 * @contrl. A helper child in the writer's pidns translates each pid back
 * to our namespace; each translated pid is permission-checked with
 * may_move_pid() before being written. Returns true on success.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		/* Never returns: _exit()s when done. */
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		/* Send the raw pid; the child echoes it back as translated
		 * SCM_CREDENTIALS (v == '0') or reports failure (v == '1').
		 */
		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;	/* -1 asks the helper child to exit (see pid_from_ns). */
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose() flushes; a failed flush means the write failed. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2991
2992 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2993 struct fuse_file_info *fi)
2994 {
2995 struct fuse_context *fc = fuse_get_context();
2996 char *localbuf = NULL;
2997 struct cgfs_files *k = NULL;
2998 struct file_info *f = (struct file_info *)fi->fh;
2999 bool r;
3000
3001 if (f->type != LXC_TYPE_CGFILE) {
3002 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3003 return -EIO;
3004 }
3005
3006 if (offset)
3007 return 0;
3008
3009 if (!fc)
3010 return -EIO;
3011
3012 localbuf = alloca(size+1);
3013 localbuf[size] = '\0';
3014 memcpy(localbuf, buf, size);
3015
3016 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3017 size = -EINVAL;
3018 goto out;
3019 }
3020
3021 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3022 size = -EACCES;
3023 goto out;
3024 }
3025
3026 if (strcmp(f->file, "tasks") == 0 ||
3027 strcmp(f->file, "/tasks") == 0 ||
3028 strcmp(f->file, "/cgroup.procs") == 0 ||
3029 strcmp(f->file, "cgroup.procs") == 0)
3030 // special case - we have to translate the pids
3031 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3032 else
3033 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3034
3035 if (!r)
3036 size = -EINVAL;
3037
3038 out:
3039 free_key(k);
3040 return size;
3041 }
3042
/*
 * cg_chown - FUSE chown handler for cgroup files/directories. Allowed
 * only when the caller is privileged (root in its userns) over the
 * current owner of the target. Returns 0 or a negative errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	__do_free char *cgdir = NULL;
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* The lxcfs mountpoint root itself may not be chowned. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* Split into (directory, basename); no basename means the target
	 * sits directly below the controller root.
	 */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);

	return ret;
}
3108
3109 int cg_chmod(const char *path, mode_t mode)
3110 {
3111 __do_free char *cgdir = NULL;
3112 struct fuse_context *fc = fuse_get_context();
3113 char *last = NULL, *path1, *path2, *controller;
3114 struct cgfs_files *k = NULL;
3115 const char *cgroup;
3116 int ret;
3117
3118 if (!fc)
3119 return -EIO;
3120
3121 if (strcmp(path, "/cgroup") == 0)
3122 return -EPERM;
3123
3124 controller = pick_controller_from_path(fc, path);
3125 if (!controller)
3126 return errno == ENOENT ? -EPERM : -errno;
3127
3128 cgroup = find_cgroup_in_path(path);
3129 if (!cgroup)
3130 /* this is just /cgroup/controller */
3131 return -EPERM;
3132
3133 get_cgdir_and_path(cgroup, &cgdir, &last);
3134
3135 if (!last) {
3136 path1 = "/";
3137 path2 = cgdir;
3138 } else {
3139 path1 = cgdir;
3140 path2 = last;
3141 }
3142
3143 if (is_child_cgroup(controller, path1, path2)) {
3144 // get uid, gid, from '/tasks' file and make up a mode
3145 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3146 k = cgfs_get_key(controller, cgroup, "tasks");
3147
3148 } else
3149 k = cgfs_get_key(controller, path1, path2);
3150
3151 if (!k) {
3152 ret = -EINVAL;
3153 goto out;
3154 }
3155
3156 /*
3157 * This being a fuse request, the uid and gid must be valid
3158 * in the caller's namespace. So we can just check to make
3159 * sure that the caller is root in his uid, and privileged
3160 * over the file's current owner.
3161 */
3162 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3163 ret = -EPERM;
3164 goto out;
3165 }
3166
3167 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3168 ret = -EINVAL;
3169 goto out;
3170 }
3171
3172 ret = 0;
3173 out:
3174 free_key(k);
3175 return ret;
3176 }
3177
/*
 * cg_mkdir - FUSE mkdir handler: create a new child cgroup. The caller
 * must be inside (an ancestor of) the parent cgroup and have write
 * access to it. Returns 0 or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	__do_free char *cgdir = NULL, *next = NULL;
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *controller;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory the new cgroup goes into. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* If the caller's own cgroup already matches the requested
		 * name, report EEXIST rather than EPERM.
		 */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	return ret;
}
3230
/*
 * cg_rmdir - FUSE rmdir handler: remove a child cgroup. The caller must
 * be in an ancestor cgroup of the victim and have write access to the
 * parent. Returns 0 or a negative errno.
 */
int cg_rmdir(const char *path)
{
	__do_free char *cgdir = NULL, *next = NULL;
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *controller;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup (or one of its ancestors)
		 * is reported as EBUSY; anything else it cannot see is ENOENT.
		 */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	return ret;
}
3290
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3297
/*
 * parse_memstat - pull the "total_*" counters out of a memory.stat dump.
 * The kernel reports the values in bytes; each one is converted to kB
 * before being stored in the corresponding output parameter. Keys that
 * do not appear leave their output untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	/* Key table: offset is strlen(key), preserved from the original
	 * hard-coded values; first match per line wins.
	 */
	const struct {
		const char *key;
		size_t off;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         11, cached        },
		{ "total_active_anon",   17, active_anon   },
		{ "total_inactive_anon", 19, inactive_anon },
		{ "total_active_file",   17, active_file   },
		{ "total_inactive_file", 19, inactive_file },
		{ "total_unevictable",   17, unevictable   },
		{ "total_shmem",         11, shmem         },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			sscanf(memstat + fields[i].off, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3334
/*
 * get_blkio_io_value - scan a blkio stat dump @str for the line matching
 * "MAJ:MIN iotype" and store its value in @v (0 when not found).
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (; *str; str = eol + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3357
/*
 * read_file - copy the contents of @path line by line into @d's cache
 * buffer, then satisfy a read of up to @size bytes from offset 0.
 * Returns the number of bytes placed in @buf (0 on any error). Note
 * d->size records the full cached length even when it exceeds @size.
 */
static int read_file(const char *path, char *buf, size_t size,
		struct file_info *d)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t linelen = 0, total_len = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		/* snprintf bounds the copy to the remaining cache space. */
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;
		}
		/* l is the would-be length: >= cache_size means truncation. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	return total_len;
}
3394
3395 /*
3396 * FUSE ops for /proc
3397 */
3398
3399 static unsigned long get_memlimit(const char *cgroup, const char *file)
3400 {
3401 __do_free char *memlimit_str = NULL;
3402 unsigned long memlimit = -1;
3403
3404 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3405 memlimit = strtoul(memlimit_str, NULL, 10);
3406
3407 return memlimit;
3408 }
3409
/*
 * get_min_memlimit - the effective limit for @cgroup is the smallest
 * value of @file configured anywhere on the path from @cgroup up to the
 * root. Walks the hierarchy with dirname() and returns that minimum.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *walk = strdupa(cgroup);
	unsigned long retlimit = get_memlimit(walk, file);

	while (strcmp(walk, "/") != 0) {
		unsigned long cur;

		walk = dirname(walk);
		cur = get_memlimit(walk, file);
		/* -1 means "unreadable/no limit" and is ignored. */
		if (cur != -1 && cur < retlimit)
			retlimit = cur;
	}

	return retlimit;
}
3426
/*
 * proc_meminfo_read - FUSE read handler for /proc/meminfo. Renders a
 * per-container view of meminfo from the caller's memory cgroup limits,
 * usage and memory.stat counters, falling back to the host's file when
 * the cgroup cannot be determined. The full output is produced on the
 * first (offset 0) read and cached in d->buf for subsequent offsets.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *line = NULL, *memusage_str = NULL,
		*memstat_str = NULL, *memswlimit_str = NULL,
		*memswusage_str = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	/* NOTE(review): opts->swap_off is dereferenced below without a NULL
	 * check — confirm private_data is always set at mount time.
	 */
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	/* Non-zero offset: serve the remainder from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* Everything below works in kB, as meminfo reports. */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	/* Rewrite the host's meminfo line by line, substituting
	 * cgroup-derived values where applicable.
	 */
	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			/* Never report more than the host actually has. */
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts->swap_off == false) {
			/* memsw counters include memory, so subtract it out. */
			unsigned long swaptotal = memswlimit,
					swapusage = memswusage - memusage,
					swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
					active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
					inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	return rv;
}
3609
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed
 */
static char *get_cpuset(const char *cg)
{
	char *answer;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer) ? answer : NULL;
}
3622
3623 bool cpu_in_cpuset(int cpu, const char *cpuset);
3624
/*
 * cpuline_in_cpuset - true when @line is a "processor : N" cpuinfo line
 * whose cpu number N is part of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3633
3634 /*
3635 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3636 * depending on `param`. Parameter value is returned throuh `value`.
3637 */
3638 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3639 {
3640 __do_free char *str = NULL;
3641 bool rv = false;
3642 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3643
3644 sprintf(file, "cpu.cfs_%s_us", param);
3645
3646 if (!cgfs_get_value("cpu", cg, file, &str))
3647 goto err;
3648
3649 if (sscanf(str, "%ld", value) != 1)
3650 goto err;
3651
3652 rv = true;
3653
3654 err:
3655 return rv;
3656 }
3657
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;
	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;
	if (quota <= 0 || period <= 0)
		return 0;

	count = quota / period;
	/* In case quota/period does not yield a whole number, add one CPU
	 * for the remainder.
	 */
	if (quota % period > 0)
		count += 1;

	/* Never advertise more CPUs than the host has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3691
3692 /*
3693 * Determine whether CPU views should be used or not.
3694 */
3695 bool use_cpuview(const char *cg)
3696 {
3697 int cfd;
3698 char *tmpc;
3699
3700 tmpc = find_mounted_controller("cpu", &cfd);
3701 if (!tmpc)
3702 return false;
3703
3704 tmpc = find_mounted_controller("cpuacct", &cfd);
3705 if (!tmpc)
3706 return false;
3707
3708 return true;
3709 }
3710
3711 /*
3712 * check whether this is a '^processor" line in /proc/cpuinfo
3713 */
3714 static bool is_processor_line(const char *line)
3715 {
3716 int cpu;
3717
3718 if (sscanf(line, "processor : %d", &cpu) == 1)
3719 return true;
3720 return false;
3721 }
3722
/*
 * Read handler for the emulated /proc/cpuinfo: renders only the CPUs that
 * are members of the calling container's cpuset, renumbering them starting
 * at 0. Falls back to the host's /proc/cpuinfo when the reader's cgroup
 * cannot be determined. The rendered text is cached in `d` so reads at
 * offset > 0 can be served from cache.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	/* Non-zero offset: serve the remainder from the cache filled in by a
	 * previous offset-0 read. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the container's init pid so we look up the container's
	 * cgroup rather than the individual task's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/* With CPU views enabled, cap the number of rendered CPUs by the
	 * cgroup's CPU quota. */
	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			/* s390x has a different /proc/cpuinfo layout; it gets
			 * special handling below and a rebuilt header at the
			 * end. */
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU limit is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* A processor stanza is only emitted (with a
			 * renumbered id) when the CPU is in the cpuset. */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x: "processor N: ..." single-line format. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-processor lines are copied through while inside a
		 * stanza that belongs to a visible CPU. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: rebuild the buffer with the vendor_id and processor-count
	 * header lines prepended to what was rendered above. */
	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;
		/* Retry until the allocation succeeds; d->buf must never be
		 * left NULL. */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			goto err;
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			goto err;
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	return rv;
}
3886
3887 static uint64_t get_reaper_start_time(pid_t pid)
3888 {
3889 int ret;
3890 FILE *f;
3891 uint64_t starttime;
3892 /* strlen("/proc/") = 6
3893 * +
3894 * LXCFS_NUMSTRLEN64
3895 * +
3896 * strlen("/stat") = 5
3897 * +
3898 * \0 = 1
3899 * */
3900 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3901 char path[__PROC_PID_STAT_LEN];
3902 pid_t qpid;
3903
3904 qpid = lookup_initpid_in_store(pid);
3905 if (qpid <= 0) {
3906 /* Caller can check for EINVAL on 0. */
3907 errno = EINVAL;
3908 return 0;
3909 }
3910
3911 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3912 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3913 /* Caller can check for EINVAL on 0. */
3914 errno = EINVAL;
3915 return 0;
3916 }
3917
3918 f = fopen(path, "r");
3919 if (!f) {
3920 /* Caller can check for EINVAL on 0. */
3921 errno = EINVAL;
3922 return 0;
3923 }
3924
3925 /* Note that the *scanf() argument supression requires that length
3926 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3927 * at us. It's like telling someone you're not married and then asking
3928 * if you can bring your wife to the party.
3929 */
3930 ret = fscanf(f, "%*d " /* (1) pid %d */
3931 "%*s " /* (2) comm %s */
3932 "%*c " /* (3) state %c */
3933 "%*d " /* (4) ppid %d */
3934 "%*d " /* (5) pgrp %d */
3935 "%*d " /* (6) session %d */
3936 "%*d " /* (7) tty_nr %d */
3937 "%*d " /* (8) tpgid %d */
3938 "%*u " /* (9) flags %u */
3939 "%*u " /* (10) minflt %lu */
3940 "%*u " /* (11) cminflt %lu */
3941 "%*u " /* (12) majflt %lu */
3942 "%*u " /* (13) cmajflt %lu */
3943 "%*u " /* (14) utime %lu */
3944 "%*u " /* (15) stime %lu */
3945 "%*d " /* (16) cutime %ld */
3946 "%*d " /* (17) cstime %ld */
3947 "%*d " /* (18) priority %ld */
3948 "%*d " /* (19) nice %ld */
3949 "%*d " /* (20) num_threads %ld */
3950 "%*d " /* (21) itrealvalue %ld */
3951 "%" PRIu64, /* (22) starttime %llu */
3952 &starttime);
3953 if (ret != 1) {
3954 fclose(f);
3955 /* Caller can check for EINVAL on 0. */
3956 errno = EINVAL;
3957 return 0;
3958 }
3959
3960 fclose(f);
3961
3962 errno = 0;
3963 return starttime;
3964 }
3965
/*
 * Return the start time, in seconds after boot, of the reaper (init)
 * process of the container the given pid belongs to. Returns 0 on failure.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Guard against a failed or nonsensical sysconf() result before
	 * dividing. The previous check (`< 0 && errno == EINVAL`) could be
	 * fooled by a stale errno and then divide by -1 (or 0), producing
	 * garbage. */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / ticks_per_sec;
}
3987
3988 static uint64_t get_reaper_age(pid_t pid)
3989 {
3990 uint64_t procstart, uptime, procage;
3991
3992 /* We need to substract the time the process has started since system
3993 * boot minus the time when the system has started to get the actual
3994 * reaper age.
3995 */
3996 procstart = get_reaper_start_time_in_sec(pid);
3997 procage = procstart;
3998 if (procstart > 0) {
3999 int ret;
4000 struct timespec spec;
4001
4002 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4003 if (ret < 0)
4004 return 0;
4005 /* We could make this more precise here by using the tv_nsec
4006 * field in the timespec struct and convert it to milliseconds
4007 * and then create a double for the seconds and milliseconds but
4008 * that seems more work than it is worth.
4009 */
4010 uptime = spec.tv_sec;
4011 procage = uptime - procstart;
4012 }
4013
4014 return procage;
4015 }
4016
4017 /*
4018 * Returns 0 on success.
4019 * It is the caller's responsibility to free `return_usage`, unless this
4020 * function returns an error.
4021 */
4022 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4023 {
4024 int cpucount = get_nprocs_conf();
4025 struct cpuacct_usage *cpu_usage;
4026 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4027 int cg_cpu;
4028 uint64_t cg_user, cg_system;
4029 int64_t ticks_per_sec;
4030 char *usage_str = NULL;
4031
4032 ticks_per_sec = sysconf(_SC_CLK_TCK);
4033
4034 if (ticks_per_sec < 0 && errno == EINVAL) {
4035 lxcfs_debug(
4036 "%s\n",
4037 "read_cpuacct_usage_all failed to determine number of clock ticks "
4038 "in a second");
4039 return -1;
4040 }
4041
4042 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4043 if (!cpu_usage)
4044 return -ENOMEM;
4045
4046 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4047 rv = -1;
4048 goto err;
4049 }
4050
4051 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4052 lxcfs_error("read_cpuacct_usage_all reading first line from "
4053 "%s/cpuacct.usage_all failed.\n", cg);
4054 rv = -1;
4055 goto err;
4056 }
4057
4058 read_pos += read_cnt;
4059
4060 for (i = 0, j = 0; i < cpucount; i++) {
4061 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4062 &cg_system, &read_cnt);
4063
4064 if (ret == EOF)
4065 break;
4066
4067 if (ret != 3) {
4068 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4069 "failed.\n", cg);
4070 rv = -1;
4071 goto err;
4072 }
4073
4074 read_pos += read_cnt;
4075
4076 /* Convert the time from nanoseconds to USER_HZ */
4077 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4078 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4079 j++;
4080 }
4081
4082 rv = 0;
4083 *return_usage = cpu_usage;
4084 *size = cpucount;
4085
4086 err:
4087 if (usage_str)
4088 free(usage_str);
4089
4090 if (rv != 0) {
4091 free(cpu_usage);
4092 *return_usage = NULL;
4093 }
4094
4095 return rv;
4096 }
4097
4098 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4099 {
4100 int i;
4101 unsigned long sum = 0;
4102
4103 for (i = 0; i < cpu_count; i++) {
4104 if (!newer[i].online)
4105 continue;
4106
4107 /* When cpuset is changed on the fly, the CPUs might get reordered.
4108 * We could either reset all counters, or check that the substractions
4109 * below will return expected results.
4110 */
4111 if (newer[i].user > older[i].user)
4112 diff[i].user = newer[i].user - older[i].user;
4113 else
4114 diff[i].user = 0;
4115
4116 if (newer[i].system > older[i].system)
4117 diff[i].system = newer[i].system - older[i].system;
4118 else
4119 diff[i].system = 0;
4120
4121 if (newer[i].idle > older[i].idle)
4122 diff[i].idle = newer[i].idle - older[i].idle;
4123 else
4124 diff[i].idle = 0;
4125
4126 sum += diff[i].user;
4127 sum += diff[i].system;
4128 sum += diff[i].idle;
4129 }
4130
4131 return sum;
4132 }
4133
4134 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4135 {
4136 unsigned long free_space, to_add;
4137
4138 free_space = threshold - usage->user - usage->system;
4139
4140 if (free_space > usage->idle)
4141 free_space = usage->idle;
4142
4143 to_add = free_space > *surplus ? *surplus : free_space;
4144
4145 *counter += to_add;
4146 usage->idle -= to_add;
4147 *surplus -= to_add;
4148 }
4149
4150 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4151 {
4152 struct cg_proc_stat *first = NULL, *prev, *tmp;
4153
4154 for (prev = NULL; node; ) {
4155 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4156 tmp = node;
4157 lxcfs_debug("Removing stat node for %s\n", node->cg);
4158
4159 if (prev)
4160 prev->next = node->next;
4161 else
4162 first = node->next;
4163
4164 node = node->next;
4165 free_proc_stat_node(tmp);
4166 } else {
4167 if (!first)
4168 first = node;
4169 prev = node;
4170 node = node->next;
4171 }
4172 }
4173
4174 return first;
4175 }
4176
4177 #define PROC_STAT_PRUNE_INTERVAL 10
4178 static void prune_proc_stat_history(void)
4179 {
4180 int i;
4181 time_t now = time(NULL);
4182
4183 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4184 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4185
4186 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4187 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4188 return;
4189 }
4190
4191 if (proc_stat_history[i]->next) {
4192 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4193 proc_stat_history[i]->lastcheck = now;
4194 }
4195
4196 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4197 }
4198 }
4199
4200 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4201 {
4202 struct cg_proc_stat *node;
4203
4204 pthread_rwlock_rdlock(&head->lock);
4205
4206 if (!head->next) {
4207 pthread_rwlock_unlock(&head->lock);
4208 return NULL;
4209 }
4210
4211 node = head->next;
4212
4213 do {
4214 if (strcmp(cg, node->cg) == 0)
4215 goto out;
4216 } while ((node = node->next));
4217
4218 node = NULL;
4219
4220 out:
4221 pthread_rwlock_unlock(&head->lock);
4222 prune_proc_stat_history();
4223 return node;
4224 }
4225
4226 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4227 {
4228 struct cg_proc_stat *node;
4229 int i;
4230
4231 node = malloc(sizeof(struct cg_proc_stat));
4232 if (!node)
4233 goto err;
4234
4235 node->cg = NULL;
4236 node->usage = NULL;
4237 node->view = NULL;
4238
4239 node->cg = malloc(strlen(cg) + 1);
4240 if (!node->cg)
4241 goto err;
4242
4243 strcpy(node->cg, cg);
4244
4245 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4246 if (!node->usage)
4247 goto err;
4248
4249 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4250
4251 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4252 if (!node->view)
4253 goto err;
4254
4255 node->cpu_count = cpu_count;
4256 node->next = NULL;
4257
4258 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4259 lxcfs_error("%s\n", "Failed to initialize node lock");
4260 goto err;
4261 }
4262
4263 for (i = 0; i < cpu_count; i++) {
4264 node->view[i].user = 0;
4265 node->view[i].system = 0;
4266 node->view[i].idle = 0;
4267 }
4268
4269 return node;
4270
4271 err:
4272 if (node && node->cg)
4273 free(node->cg);
4274 if (node && node->usage)
4275 free(node->usage);
4276 if (node && node->view)
4277 free(node->view);
4278 if (node)
4279 free(node);
4280
4281 return NULL;
4282 }
4283
4284 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4285 {
4286 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4287 struct cg_proc_stat_head *head = proc_stat_history[hash];
4288 struct cg_proc_stat *node, *rv = new_node;
4289
4290 pthread_rwlock_wrlock(&head->lock);
4291
4292 if (!head->next) {
4293 head->next = new_node;
4294 goto out;
4295 }
4296
4297 node = head->next;
4298
4299 for (;;) {
4300 if (strcmp(node->cg, new_node->cg) == 0) {
4301 /* The node is already present, return it */
4302 free_proc_stat_node(new_node);
4303 rv = node;
4304 goto out;
4305 }
4306
4307 if (node->next) {
4308 node = node->next;
4309 continue;
4310 }
4311
4312 node->next = new_node;
4313 goto out;
4314 }
4315
4316 out:
4317 pthread_rwlock_unlock(&head->lock);
4318 return rv;
4319 }
4320
4321 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4322 {
4323 struct cpuacct_usage *new_usage, *new_view;
4324 int i;
4325
4326 /* Allocate new memory */
4327 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4328 if (!new_usage)
4329 return false;
4330
4331 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4332 if (!new_view) {
4333 free(new_usage);
4334 return false;
4335 }
4336
4337 /* Copy existing data & initialize new elements */
4338 for (i = 0; i < cpu_count; i++) {
4339 if (i < node->cpu_count) {
4340 new_usage[i].user = node->usage[i].user;
4341 new_usage[i].system = node->usage[i].system;
4342 new_usage[i].idle = node->usage[i].idle;
4343
4344 new_view[i].user = node->view[i].user;
4345 new_view[i].system = node->view[i].system;
4346 new_view[i].idle = node->view[i].idle;
4347 } else {
4348 new_usage[i].user = 0;
4349 new_usage[i].system = 0;
4350 new_usage[i].idle = 0;
4351
4352 new_view[i].user = 0;
4353 new_view[i].system = 0;
4354 new_view[i].idle = 0;
4355 }
4356 }
4357
4358 free(node->usage);
4359 free(node->view);
4360
4361 node->usage = new_usage;
4362 node->view = new_view;
4363 node->cpu_count = cpu_count;
4364
4365 return true;
4366 }
4367
/*
 * Look up the cached stat node for cgroup `cg`, creating and registering a
 * new one (seeded from `usage`) when none exists yet.
 *
 * On success the returned node's mutex is LOCKED; the caller is
 * responsible for unlocking it. Returns NULL on allocation or expansion
 * failure.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node() may return an already-registered node
		 * for the same cgroup; in that case ours was freed. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
					node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4403
4404 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4405 {
4406 int i;
4407
4408 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4409 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4410
4411 for (i = 0; i < cpu_count; i++) {
4412 node->view[i].user = 0;
4413 node->view[i].system = 0;
4414 node->view[i].idle = 0;
4415 }
4416
4417 node->cpu_count = cpu_count;
4418 }
4419
4420 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4421 {
4422 char *line = NULL;
4423 size_t linelen = 0, total_len = 0, rv = 0, l;
4424 int curcpu = -1; /* cpu numbering starts at 0 */
4425 int physcpu, i;
4426 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4427 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4428 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4429 unsigned long user_surplus = 0, system_surplus = 0;
4430 unsigned long total_sum, threshold;
4431 struct cg_proc_stat *stat_node;
4432 struct cpuacct_usage *diff = NULL;
4433 int nprocs = get_nprocs_conf();
4434
4435 if (cg_cpu_usage_size < nprocs)
4436 nprocs = cg_cpu_usage_size;
4437
4438 /* Read all CPU stats and stop when we've encountered other lines */
4439 while (getline(&line, &linelen, f) != -1) {
4440 int ret;
4441 char cpu_char[10]; /* That's a lot of cores */
4442 uint64_t all_used, cg_used;
4443
4444 if (strlen(line) == 0)
4445 continue;
4446 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4447 /* not a ^cpuN line containing a number N */
4448 break;
4449 }
4450
4451 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4452 continue;
4453
4454 if (physcpu >= cg_cpu_usage_size)
4455 continue;
4456
4457 curcpu ++;
4458 cpu_cnt ++;
4459
4460 if (!cpu_in_cpuset(physcpu, cpuset)) {
4461 for (i = curcpu; i <= physcpu; i++) {
4462 cg_cpu_usage[i].online = false;
4463 }
4464 continue;
4465 }
4466
4467 if (curcpu < physcpu) {
4468 /* Some CPUs may be disabled */
4469 for (i = curcpu; i < physcpu; i++)
4470 cg_cpu_usage[i].online = false;
4471
4472 curcpu = physcpu;
4473 }
4474
4475 cg_cpu_usage[curcpu].online = true;
4476
4477 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4478 &user,
4479 &nice,
4480 &system,
4481 &idle,
4482 &iowait,
4483 &irq,
4484 &softirq,
4485 &steal,
4486 &guest,
4487 &guest_nice);
4488
4489 if (ret != 10)
4490 continue;
4491
4492 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4493 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4494
4495 if (all_used >= cg_used) {
4496 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4497
4498 } else {
4499 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4500 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4501 curcpu, cg, all_used, cg_used);
4502 cg_cpu_usage[curcpu].idle = idle;
4503 }
4504 }
4505
4506 /* Cannot use more CPUs than is available due to cpuset */
4507 if (max_cpus > cpu_cnt)
4508 max_cpus = cpu_cnt;
4509
4510 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4511
4512 if (!stat_node) {
4513 lxcfs_error("unable to find/create stat node for %s\n", cg);
4514 rv = 0;
4515 goto err;
4516 }
4517
4518 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4519 if (!diff) {
4520 rv = 0;
4521 goto err;
4522 }
4523
4524 /*
4525 * If the new values are LOWER than values stored in memory, it means
4526 * the cgroup has been reset/recreated and we should reset too.
4527 */
4528 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4529 if (!cg_cpu_usage[curcpu].online)
4530 continue;
4531
4532 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4533 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4534
4535 break;
4536 }
4537
4538 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4539
4540 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4541 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4542
4543 if (!stat_node->usage[curcpu].online)
4544 continue;
4545
4546 i++;
4547
4548 stat_node->usage[curcpu].user += diff[curcpu].user;
4549 stat_node->usage[curcpu].system += diff[curcpu].system;
4550 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4551
4552 if (max_cpus > 0 && i >= max_cpus) {
4553 user_surplus += diff[curcpu].user;
4554 system_surplus += diff[curcpu].system;
4555 }
4556 }
4557
4558 /* Calculate usage counters of visible CPUs */
4559 if (max_cpus > 0) {
4560 /* threshold = maximum usage per cpu, including idle */
4561 threshold = total_sum / cpu_cnt * max_cpus;
4562
4563 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4564 if (i == max_cpus)
4565 break;
4566
4567 if (!stat_node->usage[curcpu].online)
4568 continue;
4569
4570 i++;
4571
4572 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4573 continue;
4574
4575 /* Add user */
4576 add_cpu_usage(
4577 &user_surplus,
4578 &diff[curcpu],
4579 &diff[curcpu].user,
4580 threshold);
4581
4582 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4583 continue;
4584
4585 /* If there is still room, add system */
4586 add_cpu_usage(
4587 &system_surplus,
4588 &diff[curcpu],
4589 &diff[curcpu].system,
4590 threshold);
4591 }
4592
4593 if (user_surplus > 0)
4594 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4595 if (system_surplus > 0)
4596 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4597
4598 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4599 if (i == max_cpus)
4600 break;
4601
4602 if (!stat_node->usage[curcpu].online)
4603 continue;
4604
4605 i++;
4606
4607 stat_node->view[curcpu].user += diff[curcpu].user;
4608 stat_node->view[curcpu].system += diff[curcpu].system;
4609 stat_node->view[curcpu].idle += diff[curcpu].idle;
4610
4611 user_sum += stat_node->view[curcpu].user;
4612 system_sum += stat_node->view[curcpu].system;
4613 idle_sum += stat_node->view[curcpu].idle;
4614 }
4615
4616 } else {
4617 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4618 if (!stat_node->usage[curcpu].online)
4619 continue;
4620
4621 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4622 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4623 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4624
4625 user_sum += stat_node->view[curcpu].user;
4626 system_sum += stat_node->view[curcpu].system;
4627 idle_sum += stat_node->view[curcpu].idle;
4628 }
4629 }
4630
4631 /* Render the file */
4632 /* cpu-all */
4633 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4634 user_sum,
4635 system_sum,
4636 idle_sum);
4637
4638 if (l < 0) {
4639 perror("Error writing to cache");
4640 rv = 0;
4641 goto err;
4642
4643 }
4644 if (l >= buf_size) {
4645 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4646 rv = 0;
4647 goto err;
4648 }
4649
4650 buf += l;
4651 buf_size -= l;
4652 total_len += l;
4653
4654 /* Render visible CPUs */
4655 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4656 if (!stat_node->usage[curcpu].online)
4657 continue;
4658
4659 i++;
4660
4661 if (max_cpus > 0 && i == max_cpus)
4662 break;
4663
4664 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4665 i,
4666 stat_node->view[curcpu].user,
4667 stat_node->view[curcpu].system,
4668 stat_node->view[curcpu].idle);
4669
4670 if (l < 0) {
4671 perror("Error writing to cache");
4672 rv = 0;
4673 goto err;
4674
4675 }
4676 if (l >= buf_size) {
4677 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678 rv = 0;
4679 goto err;
4680 }
4681
4682 buf += l;
4683 buf_size -= l;
4684 total_len += l;
4685 }
4686
4687 /* Pass the rest of /proc/stat, start with the last line read */
4688 l = snprintf(buf, buf_size, "%s", line);
4689
4690 if (l < 0) {
4691 perror("Error writing to cache");
4692 rv = 0;
4693 goto err;
4694
4695 }
4696 if (l >= buf_size) {
4697 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4698 rv = 0;
4699 goto err;
4700 }
4701
4702 buf += l;
4703 buf_size -= l;
4704 total_len += l;
4705
4706 /* Pass the rest of the host's /proc/stat */
4707 while (getline(&line, &linelen, f) != -1) {
4708 l = snprintf(buf, buf_size, "%s", line);
4709 if (l < 0) {
4710 perror("Error writing to cache");
4711 rv = 0;
4712 goto err;
4713 }
4714 if (l >= buf_size) {
4715 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4716 rv = 0;
4717 goto err;
4718 }
4719 buf += l;
4720 buf_size -= l;
4721 total_len += l;
4722 }
4723
4724 rv = total_len;
4725
4726 err:
4727 if (stat_node)
4728 pthread_mutex_unlock(&stat_node->lock);
4729 if (line)
4730 free(line);
4731 if (diff)
4732 free(diff);
4733 return rv;
4734 }
4735
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * Read handler for the emulated /proc/stat: shows only the CPUs in the
 * caller's cpuset (renumbered from 0) and, when a cpuacct snapshot is
 * available, replaces per-CPU user/system/idle with the cgroup's own
 * counters. Falls back to the host's /proc/stat when the reader is not in
 * a tracked cgroup. The rendered text is cached in `d` for offset reads.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
			irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	/* Non-zero offset: serve the remainder from the cache filled in by a
	 * previous offset-0 read. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	/* Resolve the container's init pid so we look up the container's
	 * cgroup rather than the individual task's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* With CPU views enabled and cgroup data available, delegate the
	 * whole rendering to cpuview_proc_stat(). */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* Without cgroup data (or on a parse failure), emit the
		 * host's own line with the CPU renumbered. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			/* Host time beyond the cgroup's own consumption is
			 * accounted as extra idle time for this CPU. */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Write the aggregate "cpu " line into the space reserved at the
	 * start of d->buf. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-CPU lines up to sit right after the aggregate line
	 * (regions may overlap, hence memmove). */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4978
/* Derive the busy time of a container from its init's cpuacct cgroup.
 * Unfortunately this only makes sense when the container has been given
 * its own cpuacct cgroup: otherwise the busy time of all other tasks
 * sharing that cgroup is counted as well. If someone has a clever
 * solution for this please send a patch!
 *
 * Returns whole seconds of cpu usage, or 0 on any failure.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	unsigned long busy = 0;
	char *usage_str = NULL;
	char *cgroup = NULL;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);

	if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str)) {
		/* cpuacct.usage is in nanoseconds; convert to seconds. */
		busy = strtoul(usage_str, NULL, 10);
		busy /= 1000000000;
	}

out:
	free(cgroup);
	free(usage_str);
	return busy;
}
5009
5010 #if RELOADTEST
void iwashere(void)
{
	/* Drop a marker file so reload tests can detect we ran. */
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd >= 0)
		close(fd);
}
5019 #endif
5020
/*
 * Read handler for the emulated /proc/uptime.
 *
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the mtime for the reaper for
 * the calling pid as returned by getreaperage
 *
 * Returns the number of bytes copied into @buf, or 0 on cache-write
 * failure / exhausted cache.
 */
static int proc_uptime_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long int busytime = get_reaper_busy(fc->pid);
	char *cache = d->buf;
	ssize_t total_len = 0;
	uint64_t idletime, reaperage;

#if RELOADTEST
	iwashere();
#endif

	/* Continued read: serve the remainder of the previously cached
	 * answer rather than recomputing it mid-file. */
	if (offset){
		if (!d->cached)
			return 0;
		if (offset > d->size)
			return -EINVAL;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	reaperage = get_reaper_age(fc->pid);
	/* To understand why this is done, please read the comment to the
	 * get_reaper_busy() function.
	 */
	idletime = reaperage;
	if (reaperage >= busytime)
		idletime = reaperage - busytime;

	/* Render "<uptime>.00 <idle>.00\n" into the per-handle cache. */
	total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
	if (total_len < 0 || total_len >= d->buflen){
		lxcfs_error("%s\n", "failed to write to cache");
		return 0;
	}

	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size) total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
5073
/* Read handler for the emulated /proc/diskstats.
 *
 * Walks the host's /proc/diskstats for the device list, but substitutes
 * per-device counters taken from the caller's blkio cgroup (the
 * *_recursive blkio files), so a container sees only its own I/O.
 * Devices with no recorded container activity are omitted.
 *
 * Returns bytes copied into @buf, 0 on internal failure, or -EINVAL for
 * an out-of-range offset.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
		*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continued read: serve the rest of the cached result. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the caller to its container init and its blkio cgroup. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		/* Not in a tracked cgroup: pass the host file through. */
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Snapshot all per-cgroup blkio counter files up front. */
	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		/* Only the device identity (major minor name) comes from the
		 * host; the counters are re-derived from the cgroup below. */
		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* io_service_bytes is in bytes; diskstats wants 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Time counters are in nanoseconds; convert to milliseconds. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		/* Skip devices this container never touched. */
		memset(lbuf, 0, 256);
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5206
5207 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5208 struct fuse_file_info *fi)
5209 {
5210 struct fuse_context *fc = fuse_get_context();
5211 struct file_info *d = (struct file_info *)fi->fh;
5212 char *cg = NULL;
5213 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5214 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5215 ssize_t total_len = 0, rv = 0;
5216 ssize_t l = 0;
5217 char *cache = d->buf;
5218
5219 if (offset) {
5220 if (offset > d->size)
5221 return -EINVAL;
5222 if (!d->cached)
5223 return 0;
5224 int left = d->size - offset;
5225 total_len = left > size ? size: left;
5226 memcpy(buf, cache + offset, total_len);
5227 return total_len;
5228 }
5229
5230 pid_t initpid = lookup_initpid_in_store(fc->pid);
5231 if (initpid <= 0)
5232 initpid = fc->pid;
5233 cg = get_pid_cgroup(initpid, "memory");
5234 if (!cg)
5235 return read_file("/proc/swaps", buf, size, d);
5236 prune_init_slice(cg);
5237
5238 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5239
5240 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5241 goto err;
5242
5243 memusage = strtoul(memusage_str, NULL, 10);
5244
5245 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5246 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5247
5248 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5249 memswusage = strtoul(memswusage_str, NULL, 10);
5250
5251 swap_total = (memswlimit - memlimit) / 1024;
5252 swap_free = (memswusage - memusage) / 1024;
5253 }
5254
5255 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5256
5257 /* When no mem + swap limit is specified or swapaccount=0*/
5258 if (!memswlimit) {
5259 char *line = NULL;
5260 size_t linelen = 0;
5261 FILE *f = fopen("/proc/meminfo", "r");
5262
5263 if (!f)
5264 goto err;
5265
5266 while (getline(&line, &linelen, f) != -1) {
5267 if (startswith(line, "SwapTotal:")) {
5268 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5269 } else if (startswith(line, "SwapFree:")) {
5270 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5271 }
5272 }
5273
5274 free(line);
5275 fclose(f);
5276 }
5277
5278 if (swap_total > 0) {
5279 l = snprintf(d->buf + total_len, d->size - total_len,
5280 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5281 swap_total, swap_free);
5282 total_len += l;
5283 }
5284
5285 if (total_len < 0 || l < 0) {
5286 perror("Error writing to cache");
5287 rv = 0;
5288 goto err;
5289 }
5290
5291 d->cached = 1;
5292 d->size = (int)total_len;
5293
5294 if (total_len > size) total_len = size;
5295 memcpy(buf, d->buf, total_len);
5296 rv = total_len;
5297
5298 err:
5299 free(cg);
5300 free(memswlimit_str);
5301 free(memlimit_str);
5302 free(memusage_str);
5303 free(memswusage_str);
5304 return rv;
5305 }
/*
 * Find the process pid from cgroup path.
 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
 * @pid_buf : put pid to pid_buf.
 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : the depth of cgroup in container.
 * @sum : return the number of pid.
 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Recurses into child cgroup directories up to @depth levels, appending
 * every line of each cgroup.procs file (one pid per line, newline kept)
 * to the caller-owned *pid_buf array. Returns the updated entry count.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0 */
	/* Spin until malloc succeeds: allocation failure here is treated
	 * as transient rather than fatal. */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd on success; on failure we must
	 * close it ourselves. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): comparing only 1 char skips every dotfile,
		 * including "..", so the second check is redundant. */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			/* Accumulate pids from the child cgroup. */
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Now collect the pids directly attached to this cgroup. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		/* Grow the array by one slot and duplicate the line
		 * (including its trailing newline). */
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5392 /*
5393 * calc_load calculates the load according to the following formula:
5394 * load1 = load0 * exp + active * (1 - exp)
5395 *
5396 * @load1: the new loadavg.
5397 * @load0: the former loadavg.
5398 * @active: the total number of running pid at this moment.
5399 * @exp: the fixed-point defined in the beginning.
5400 */
5401 static unsigned long
5402 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5403 {
5404 unsigned long newload;
5405
5406 active = active > 0 ? active * FIXED_1 : 0;
5407 newload = load * exp + active * (FIXED_1 - exp);
5408 if (active >= load)
5409 newload += FIXED_1 - 1;
5410
5411 return newload / FIXED_1;
5412 }
5413
5414 /*
5415 * Return 0 means that container p->cg is closed.
5416 * Return -1 means that error occurred in refresh.
5417 * Positive num equals the total number of pid.
5418 */
5419 static int refresh_load(struct load_node *p, char *path)
5420 {
5421 FILE *f = NULL;
5422 char **idbuf;
5423 char proc_path[256];
5424 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5425 char *line = NULL;
5426 size_t linelen = 0;
5427 int sum, length;
5428 DIR *dp;
5429 struct dirent *file;
5430
5431 do {
5432 idbuf = malloc(sizeof(char *));
5433 } while (!idbuf);
5434 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5435 /* normal exit */
5436 if (sum == 0)
5437 goto out;
5438
5439 for (i = 0; i < sum; i++) {
5440 /*clean up '\n' */
5441 length = strlen(idbuf[i])-1;
5442 idbuf[i][length] = '\0';
5443 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5444 if (ret < 0 || ret > 255) {
5445 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5446 i = sum;
5447 sum = -1;
5448 goto err_out;
5449 }
5450
5451 dp = opendir(proc_path);
5452 if (!dp) {
5453 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5454 continue;
5455 }
5456 while ((file = readdir(dp)) != NULL) {
5457 if (strncmp(file->d_name, ".", 1) == 0)
5458 continue;
5459 if (strncmp(file->d_name, "..", 1) == 0)
5460 continue;
5461 total_pid++;
5462 /* We make the biggest pid become last_pid.*/
5463 ret = atof(file->d_name);
5464 last_pid = (ret > last_pid) ? ret : last_pid;
5465
5466 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5467 if (ret < 0 || ret > 255) {
5468 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5469 i = sum;
5470 sum = -1;
5471 closedir(dp);
5472 goto err_out;
5473 }
5474 f = fopen(proc_path, "r");
5475 if (f != NULL) {
5476 while (getline(&line, &linelen, f) != -1) {
5477 /* Find State */
5478 if ((line[0] == 'S') && (line[1] == 't'))
5479 break;
5480 }
5481 if ((line[7] == 'R') || (line[7] == 'D'))
5482 run_pid++;
5483 fclose(f);
5484 }
5485 }
5486 closedir(dp);
5487 }
5488 /*Calculate the loadavg.*/
5489 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5490 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5491 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5492 p->run_pid = run_pid;
5493 p->total_pid = total_pid;
5494 p->last_pid = last_pid;
5495
5496 free(line);
5497 err_out:
5498 for (; i > 0; i--)
5499 free(idbuf[i-1]);
5500 out:
5501 free(idbuf);
5502 return sum;
5503 }
/*
 * Traverse the hash table and update it.
 *
 * Body of the background refresh thread started by load_daemon(): on
 * every pass it walks all LOAD_SIZE buckets, recomputes each tracked
 * cgroup's loadavg via refresh_load(), deletes nodes whose cgroup has
 * no tasks left, then sleeps out the remainder of the FLUSH_TIME
 * interval. Exits when loadavg_stop is set by stop_load_daemon().
 */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				/* Spin until malloc succeeds. */
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* Prefix absolute cgroup paths with '.' so they
				 * resolve relative to the cgroup mount fd. */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}
				sum = refresh_load(f, path);
				if (sum == 0) {
					/* Cgroup is empty: unlink the node.
					 * del_node() returns the next one. */
					f = del_node(f, i);
				} else {
					/* The "out" label sits inside this else so the
					 * snprintf-failure path skips the refresh and
					 * simply advances to the next node. */
out:					f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		/* Sleep for what remains of the flush interval after
		 * subtracting the time spent refreshing. */
		time2 = clock();
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5564
/* Read handler for the emulated /proc/loadavg.
 *
 * Produces a per-container load average from the caller's cpu cgroup.
 * The cgroup is tracked in the load_hash table; the background thread
 * (load_begin) keeps each node's avenrun up to date. Falls back to the
 * host's /proc/loadavg when the loadavg feature is off or the cgroup
 * cannot be resolved.
 *
 * Locking: locate_node() returns with load_hash[hash].rdlock held; it
 * is released here on every path after the node data has been read.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continued read: serve the rest of the cached result. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		/* Register a fresh node; its avenrun starts at zero and is
		 * filled in by the refresh thread. Mallocs spin until they
		 * succeed. */
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 rounds the fixed-point values for display. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5657 /* Return a positive number on success, return 0 on failure.*/
5658 pthread_t load_daemon(int load_use)
5659 {
5660 int ret;
5661 pthread_t pid;
5662
5663 ret = init_load();
5664 if (ret == -1) {
5665 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5666 return 0;
5667 }
5668 ret = pthread_create(&pid, NULL, load_begin, NULL);
5669 if (ret != 0) {
5670 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5671 load_free();
5672 return 0;
5673 }
5674 /* use loadavg, here loadavg = 1*/
5675 loadavg = load_use;
5676 return pid;
5677 }
5678
5679 /* Returns 0 on success. */
5680 int stop_load_daemon(pthread_t pid)
5681 {
5682 int s;
5683
5684 /* Signal the thread to gracefully stop */
5685 loadavg_stop = 1;
5686
5687 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5688 if (s != 0) {
5689 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5690 return -1;
5691 }
5692
5693 load_free();
5694 loadavg_stop = 0;
5695
5696 return 0;
5697 }
5698
/* Estimate the content size of a proc-style file by summing the length
 * of every line. Returns 0 when the file cannot be opened. */
static off_t get_procfile_size(const char *which)
{
	char *lbuf = NULL;
	size_t lcap = 0;
	ssize_t llen;
	off_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	while ((llen = getline(&lbuf, &lcap, fp)) != -1)
		total += llen;

	fclose(fp);
	free(lbuf);

	return total;
}
5715
5716 int proc_getattr(const char *path, struct stat *sb)
5717 {
5718 struct timespec now;
5719
5720 memset(sb, 0, sizeof(struct stat));
5721 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5722 return -EINVAL;
5723 sb->st_uid = sb->st_gid = 0;
5724 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5725 if (strcmp(path, "/proc") == 0) {
5726 sb->st_mode = S_IFDIR | 00555;
5727 sb->st_nlink = 2;
5728 return 0;
5729 }
5730 if (strcmp(path, "/proc/meminfo") == 0 ||
5731 strcmp(path, "/proc/cpuinfo") == 0 ||
5732 strcmp(path, "/proc/uptime") == 0 ||
5733 strcmp(path, "/proc/stat") == 0 ||
5734 strcmp(path, "/proc/diskstats") == 0 ||
5735 strcmp(path, "/proc/swaps") == 0 ||
5736 strcmp(path, "/proc/loadavg") == 0) {
5737 sb->st_size = 0;
5738 sb->st_mode = S_IFREG | 00444;
5739 sb->st_nlink = 1;
5740 return 0;
5741 }
5742
5743 return -ENOENT;
5744 }
5745
5746 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5747 struct fuse_file_info *fi)
5748 {
5749 if (filler(buf, ".", NULL, 0) != 0 ||
5750 filler(buf, "..", NULL, 0) != 0 ||
5751 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5752 filler(buf, "meminfo", NULL, 0) != 0 ||
5753 filler(buf, "stat", NULL, 0) != 0 ||
5754 filler(buf, "uptime", NULL, 0) != 0 ||
5755 filler(buf, "diskstats", NULL, 0) != 0 ||
5756 filler(buf, "swaps", NULL, 0) != 0 ||
5757 filler(buf, "loadavg", NULL, 0) != 0)
5758 return -EINVAL;
5759 return 0;
5760 }
5761
5762 int proc_open(const char *path, struct fuse_file_info *fi)
5763 {
5764 int type = -1;
5765 struct file_info *info;
5766
5767 if (strcmp(path, "/proc/meminfo") == 0)
5768 type = LXC_TYPE_PROC_MEMINFO;
5769 else if (strcmp(path, "/proc/cpuinfo") == 0)
5770 type = LXC_TYPE_PROC_CPUINFO;
5771 else if (strcmp(path, "/proc/uptime") == 0)
5772 type = LXC_TYPE_PROC_UPTIME;
5773 else if (strcmp(path, "/proc/stat") == 0)
5774 type = LXC_TYPE_PROC_STAT;
5775 else if (strcmp(path, "/proc/diskstats") == 0)
5776 type = LXC_TYPE_PROC_DISKSTATS;
5777 else if (strcmp(path, "/proc/swaps") == 0)
5778 type = LXC_TYPE_PROC_SWAPS;
5779 else if (strcmp(path, "/proc/loadavg") == 0)
5780 type = LXC_TYPE_PROC_LOADAVG;
5781 if (type == -1)
5782 return -ENOENT;
5783
5784 info = malloc(sizeof(*info));
5785 if (!info)
5786 return -ENOMEM;
5787
5788 memset(info, 0, sizeof(*info));
5789 info->type = type;
5790
5791 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5792 do {
5793 info->buf = malloc(info->buflen);
5794 } while (!info->buf);
5795 memset(info->buf, 0, info->buflen);
5796 /* set actual size to buffer size */
5797 info->size = info->buflen;
5798
5799 fi->fh = (unsigned long)info;
5800 return 0;
5801 }
5802
/* FUSE access for the virtual /proc tree: the directory mirrors the
 * host's readability; every file is read-only. */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	if (mask & ~R_OK)
		return -EACCES;

	return 0;
}
5813
/* FUSE release for the virtual /proc files: the path is irrelevant,
 * all per-handle state lives in fi->fh. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5819
5820 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5821 struct fuse_file_info *fi)
5822 {
5823 struct file_info *f = (struct file_info *) fi->fh;
5824
5825 switch (f->type) {
5826 case LXC_TYPE_PROC_MEMINFO:
5827 return proc_meminfo_read(buf, size, offset, fi);
5828 case LXC_TYPE_PROC_CPUINFO:
5829 return proc_cpuinfo_read(buf, size, offset, fi);
5830 case LXC_TYPE_PROC_UPTIME:
5831 return proc_uptime_read(buf, size, offset, fi);
5832 case LXC_TYPE_PROC_STAT:
5833 return proc_stat_read(buf, size, offset, fi);
5834 case LXC_TYPE_PROC_DISKSTATS:
5835 return proc_diskstats_read(buf, size, offset, fi);
5836 case LXC_TYPE_PROC_SWAPS:
5837 return proc_swaps_read(buf, size, offset, fi);
5838 case LXC_TYPE_PROC_LOADAVG:
5839 return proc_loadavg_read(buf, size, offset, fi);
5840 default:
5841 return -EINVAL;
5842 }
5843 }
5844
5845 /*
5846 * Functions needed to setup cgroups in the __constructor__.
5847 */
5848
/* Create @dir and every missing parent with the given @mode, like
 * `mkdir -p`. Existing components are fine; any other mkdir failure
 * aborts. Returns true on success. */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *segment = dir;
	const char *cursor = dir;

	do {
		char *prefix;

		/* Skip any slashes, then advance past the next component. */
		segment = cursor + strspn(cursor, "/");
		cursor = segment + strcspn(segment, "/");

		/* Everything up to the start of the current component. */
		prefix = strndup(dir, segment - dir);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);
	} while (cursor != segment);

	return true;
}
5872
5873 static bool umount_if_mounted(void)
5874 {
5875 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5876 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5877 return false;
5878 }
5879 return true;
5880 }
5881
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* True when the statfs buffer's filesystem magic equals @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	return fs->f_type == (fs_type_magic)magic_val;
}
5888
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true when /proc/self/mountinfo lists "/" as a rootfs mount.
 */
static bool is_on_ramfs(void)
{
	char *line = NULL;
	size_t len = 0;
	bool found = false;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&line, &len, f) != -1) {
		char *field, *end;
		int skipped;

		/* The mount point is the 5th space-separated field. */
		field = line;
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;
		end = strchr(field + 1, ' ');
		if (!end)
			continue;
		*end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		field = strchr(end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0) {
			found = true;
			break;
		}
	}
	free(line);
	fclose(f);
	return found;
}
5931
5932 static int pivot_enter()
5933 {
5934 int ret = -1, oldroot = -1, newroot = -1;
5935
5936 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5937 if (oldroot < 0) {
5938 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5939 return ret;
5940 }
5941
5942 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5943 if (newroot < 0) {
5944 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5945 goto err;
5946 }
5947
5948 /* change into new root fs */
5949 if (fchdir(newroot) < 0) {
5950 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5951 goto err;
5952 }
5953
5954 /* pivot_root into our new root fs */
5955 if (pivot_root(".", ".") < 0) {
5956 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5957 goto err;
5958 }
5959
5960 /*
5961 * At this point the old-root is mounted on top of our new-root.
5962 * To unmounted it we must not be chdir'd into it, so escape back
5963 * to the old-root.
5964 */
5965 if (fchdir(oldroot) < 0) {
5966 lxcfs_error("%s\n", "Failed to enter old root.");
5967 goto err;
5968 }
5969
5970 if (umount2(".", MNT_DETACH) < 0) {
5971 lxcfs_error("%s\n", "Failed to detach old root.");
5972 goto err;
5973 }
5974
5975 if (fchdir(newroot) < 0) {
5976 lxcfs_error("%s\n", "Failed to re-enter new root.");
5977 goto err;
5978 }
5979
5980 ret = 0;
5981
5982 err:
5983 if (oldroot > 0)
5984 close(oldroot);
5985 if (newroot > 0)
5986 close(newroot);
5987
5988 return ret;
5989 }
5990
5991 static int chroot_enter()
5992 {
5993 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5994 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5995 return -1;
5996 }
5997
5998 if (chroot(".") < 0) {
5999 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6000 return -1;
6001 }
6002
6003 if (chdir("/") < 0) {
6004 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6005 return -1;
6006 }
6007
6008 return 0;
6009 }
6010
6011 static int permute_and_enter(void)
6012 {
6013 struct statfs sb;
6014
6015 if (statfs("/", &sb) < 0) {
6016 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6017 return -1;
6018 }
6019
6020 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6021 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6022 * /proc/1/mountinfo. */
6023 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6024 return chroot_enter();
6025
6026 if (pivot_enter() < 0) {
6027 lxcfs_error("%s\n", "Could not perform pivot root.");
6028 return -1;
6029 }
6030
6031 return 0;
6032 }
6033
6034 /* Prepare our new clean root. */
6035 static int permute_prepare(void)
6036 {
6037 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6038 lxcfs_error("%s\n", "Failed to create directory for new root.");
6039 return -1;
6040 }
6041
6042 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6043 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6044 return -1;
6045 }
6046
6047 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6048 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6049 return -1;
6050 }
6051
6052 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6053 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6054 return -1;
6055 }
6056
6057 return 0;
6058 }
6059
/* Build the new root and switch into it. Uses chroot() on ramfs and
 * pivot_root() everywhere else (see permute_and_enter()).
 *
 * Returns true on success, false on error.
 */
static bool permute_root(void)
{
	/* Both helpers return 0 on success and -1 on failure; short-circuit
	 * so we never try to enter a root we failed to prepare. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6073
/* Open a read-only, close-on-exec file descriptor referring to the mount
 * namespace of @pid (/proc/<pid>/ns/mnt).
 *
 * Returns the fd on success, -1 on error.
 */
static int preserve_mnt_ns(int pid)
{
	/* "/proc/" + up-to-21-digit pid + "/ns/mnt" + NUL terminators
	 * accounted for by the two sizeof()s. */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int ret;

	ret = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
6086
/* Prepare a private mount namespace in which lxcfs assembles its own view of
 * the cgroup hierarchies: create BASEDIR, clean up any stale mount there,
 * unshare the mount namespace, preserve an fd to it in the global
 * cgroup_mount_ns_fd, make / private, and mount a small tmpfs on BASEDIR.
 * The order of these steps matters; do not reorder.
 *
 * Returns true on success, false on error (a message is logged).
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* A previous lxcfs instance may have left BASEDIR mounted. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Everything below happens in a fresh mount namespace so the host's
	 * mount table is not polluted. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep an fd to this new namespace; closed by the destructor. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Stop mount events from propagating back to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6122
6123 static bool cgfs_mount_hierarchies(void)
6124 {
6125 char *target;
6126 size_t clen, len;
6127 int i, ret;
6128
6129 for (i = 0; i < num_hierarchies; i++) {
6130 char *controller = hierarchies[i];
6131
6132 clen = strlen(controller);
6133 len = strlen(BASEDIR) + clen + 2;
6134 target = malloc(len);
6135 if (!target)
6136 return false;
6137
6138 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6139 if (ret < 0 || ret >= len) {
6140 free(target);
6141 return false;
6142 }
6143 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6144 free(target);
6145 return false;
6146 }
6147 if (!strcmp(controller, "unified"))
6148 ret = mount("none", target, "cgroup2", 0, NULL);
6149 else
6150 ret = mount(controller, target, "cgroup", 0, controller);
6151 if (ret < 0) {
6152 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6153 free(target);
6154 return false;
6155 }
6156
6157 fd_hierarchies[i] = open(target, O_DIRECTORY);
6158 if (fd_hierarchies[i] < 0) {
6159 free(target);
6160 return false;
6161 }
6162 free(target);
6163 }
6164 return true;
6165 }
6166
/* Assemble lxcfs's private cgroup view: prepare the private mount
 * namespace, mount every hierarchy, then pivot/chroot into the new root.
 *
 * Returns true on success, false on error.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	/* permute_root() already reports its own failures. */
	return permute_root();
}
6182
/* Library constructor: parse /proc/self/cgroup to discover the mounted
 * cgroup hierarchies, then build lxcfs's private cgroup mounts in a fresh
 * mount namespace and initialize the CPU view. On any failure it logs and
 * returns, leaving the library partially initialized.
 */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line has the form "<idx>:<controllers>:<path>"; split out the
	 * index and the controller list in place. */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		/* Records the hierarchy in the global hierarchies[] table —
		 * NOTE(review): presumed from the name; definition not visible
		 * in this chunk. */
		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	/* -1 marks "not opened yet" so the destructor knows what to close. */
	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd so we can restore it after the namespace dance. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Switch back to the namespace we preserved above. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6273
6274 static void __attribute__((destructor)) free_subsystems(void)
6275 {
6276 int i;
6277
6278 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6279
6280 for (i = 0; i < num_hierarchies; i++) {
6281 if (hierarchies[i])
6282 free(hierarchies[i]);
6283 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6284 close(fd_hierarchies[i]);
6285 }
6286 free(hierarchies);
6287 free(fd_hierarchies);
6288 free_cpuview();
6289
6290 if (cgroup_mount_ns_fd >= 0)
6291 close(cgroup_mount_ns_fd);
6292 }