/* lxcfs
 *
 * Copyright © 2014-2016 Canonical, Inc
 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
 *
 * See COPYING file for details.
 */

#define FUSE_USE_VERSION 26

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h" // for VERSION

/* A 64-bit integer formatted as a string fits in 21 characters:
 * 20 digits for 2^64 - 1, plus the terminating NUL. */
#define LXCFS_NUMSTRLEN64 21

/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif

enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};

struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	// unused as of yet
	int buflen;
	int size;	// actual data size
	int cached;
};

struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
};

/* Constants for the loadavg hash table. */
#define LOAD_SIZE 100	/* the size of the hash table */
#define FLUSH_TIME 5	/* the flush rate */
#define DEPTH_DIR 3	/* the depth of each cgroup */
/* Constants for calculating loadavg. */
#define FSHIFT 11		/* nr of bits of precision */
#define FIXED_1 (1 << FSHIFT)	/* 1.0 as fixed-point */
#define EXP_1 1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014		/* 1/exp(5sec/5min) */
#define EXP_15 2037		/* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)
/*
 * Controls proc_loadavg_read(): 1 means the virtualized loadavg is used,
 * 0 means it is not.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;

static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int x = 0;
	/* ELF hash algorithm. */
	while (*name) {
		hash = (hash << 4) + *name++;
		x = hash & 0xf0000000;
		if (x != 0)
			hash ^= (x >> 24);
		hash &= ~x;
	}
	return (hash & 0x7fffffff);
}

struct load_node {
	char *cg;			/* cgroup name */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	struct load_node **pre;		/* points to the previous node's next field, or the list head */
};

struct load_head {
	/*
	 * Protects inserting and refreshing load_nodes. For the first
	 * load_node of each hash bucket, insert and refresh are mutually
	 * exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * Protects reading loadavg against deleting load_nodes. Within each
	 * hash bucket, read and delete are mutually exclusive, but concurrent
	 * reads are allowed. This rwlock works at the list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Protects reading loadavg against inserting load_nodes. For the
	 * first load_node of each hash bucket, read and insert are mutually
	 * exclusive, but concurrent reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE];	/* hash table */
/*
 * init_load() initializes the hash table.
 * Returns 0 on success, -1 on failure.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}

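/* Insert *n at the head of hash bucket @locate, fixing up the doubly linked
 * list, under the bucket's mutex and insert rwlock. */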
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() finds a specific node; a non-NULL return value means success.
 * Note that rdlock is deliberately not unlocked before returning, because
 * this function is used to read a specific node and deletion must not happen
 * before the read has finished. rdlock is unlocked only in
 * proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
/* Delete load_node n and return the node that followed it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}

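/* Tear down the loadavg hash table: free all remaining load_nodes and
 * destroy each bucket's locks. */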
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

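/* Allocate and initialize a single bucket head for the CPU-view hash table.
 * Returns false on allocation or lock-init failure. */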
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head)) {
		lxcfs_error("%s\n", strerror(errno));
		return false;
	}

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		lxcfs_error("%s\n", "Failed to initialize list lock");
		free(*head);
		return false;
	}

	return true;
}

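/* Initialize every bucket of the CPU-view hash table; on failure, free any
 * bucket heads that were already allocated. */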
static bool init_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i]) {
			free(proc_stat_history[i]);
			proc_stat_history[i] = NULL;
		}
	}

	return false;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free(head);
}

static void free_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
	}
}

/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *    a. if not, fork a child in qpid's ns to send us
 *       ucred.pid = 1, and read the initpid. Cache
 *       initpid and creation time for /proc/initpid
 *       in a new store entry.
 *    b. if so, verify that /proc/initpid still matches
 *       what we have saved. If not, clear the store
 *       entry and go back to a. If so, return the
 *       cached initpid.
 */
struct pidns_init_store {
	ino_t ino;	// inode number for /proc/$pid/ns/pid
	pid_t initpid;	// the pid of init in that ns
	long int ctime;	// the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck;
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;

static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_lock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;

static void unlock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_unlock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}

/* Must be called under store_lock */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		    initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}

/* Must be called under store_lock */
static void remove_initpid(struct pidns_init_store *e)
{
	struct pidns_init_store *tmp;
	int h;

	lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);

	h = HASH(e->ino);
	if (pidns_hash_table[h] == e) {
		pidns_hash_table[h] = e->next;
		free(e);
		return;
	}

	tmp = pidns_hash_table[h];
	while (tmp) {
		if (tmp->next == e) {
			tmp->next = e->next;
			free(e);
			return;
		}
		tmp = tmp->next;
	}
}

#define PURGE_SECS 5
/* Must be called under store_lock */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {
				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}

/* Must be called under store_lock */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}

/*
 * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
 * entry for the inode number and creation time. Verify that the init pid
 * is still valid. If not, remove it. Return the entry if valid, NULL
 * otherwise.
 * Must be called under store_lock.
 */
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
	int h = HASH(sb->st_ino);
	struct pidns_init_store *e = pidns_hash_table[h];

	while (e) {
		if (e->ino == sb->st_ino) {
			if (initpid_still_valid(e, sb)) {
				e->lastcheck = time(NULL);
				return e;
			}
			remove_initpid(e);
			return NULL;
		}
		e = e->next;
	}

	return NULL;
}

static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}

static char *must_copy_string(const char *str)
{
	char *dup = NULL;
	if (!str)
		return NULL;
	do {
		dup = strdup(str);
	} while (!dup);

	return dup;
}

static inline void drop_trailing_newlines(char *s)
{
	int l;

	for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--)
		s[l - 1] = '\0';
}

#define BATCH_SIZE 50
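/* Grow *mem so it can hold newlen bytes, allocating in BATCH_SIZE chunks so
 * repeated small appends do not realloc every time. Loops until realloc
 * succeeds. */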
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;
		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}
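
/* Append @line (linelen bytes plus its terminating NUL) to *contents, growing
 * the buffer as needed and updating *len. */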
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = newlen;
}

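/* Read the whole contents of @fd (opened for @from) into a newly allocated
 * string, stripping trailing newlines. Returns NULL on failure; the fd is
 * consumed via fdopen()/fclose(). */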
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}

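/* Write @string to @fd (opened for @fnam), logging and returning false on a
 * short write or close failure. The fd is closed via fclose(). */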
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f)
		return false;

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}

struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

#define ALLOC_NUM 20
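/* Record hierarchy name @h in the global hierarchies array, growing the
 * array in ALLOC_NUM-sized steps. Exits on allocation failure. */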
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}

static void print_subsystems(void)
{
	int i;

	fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i])
			fprintf(stderr, " %2d: fd: %3d: %s\n", i,
				fd_hierarchies[i], hierarchies[i]);
	}
}

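/* Return true if @needle appears as a full entry in the comma-separated list
 * @haystack, e.g. "cpu" matches "cpu,cpuacct" but not "cpuset". */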
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *s = haystack, *e;
	size_t nlen = strlen(needle);

	while (*s && (e = strchr(s, ','))) {
		if (nlen != e - s) {
			s = e + 1;
			continue;
		}
		if (strncmp(needle, s, nlen) == 0)
			return true;
		s = e + 1;
	}
	if (strcmp(needle, s) == 0)
		return true;
	return false;
}

/* do we need to do any massaging here?  I'm not sure... */
/* Return the mounted controller and store the corresponding open file
 * descriptor referring to the controller mountpoint in the private lxcfs
 * namespace in @cfd.
 */
static char *find_mounted_controller(const char *controller, int *cfd)
{
	int i;

	for (i = 0; i < num_hierarchies; i++) {
		if (!hierarchies[i])
			continue;
		if (strcmp(hierarchies[i], controller) == 0) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
		if (in_comma_list(controller, hierarchies[i])) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
	}

	return NULL;
}

bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}

// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}

int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}

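/* Recursively remove the cgroup directory @dirname. @fd is an open descriptor
 * for the directory being listed, @cfd the descriptor of the controller
 * mountpoint against which relative paths are resolved. Returns false if
 * anything could not be removed. */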
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() takes ownership of the fd it is handed, so work on a duplicate.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}

bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam, *tmpc;
	bool bret;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}

bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchmodat(cfd, pathname, mode, 0) < 0)
		return false;
	return true;
}

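/* Chown the tasks and cgroup.procs files under @dirname to @uid:@gid.
 * Returns 0 on success, -errno on failure. */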
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *fname;

	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);
	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	return 0;
}

int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}

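/* Open the cgroup.procs file of @cgroup under @controller for writing.
 * Returns a FILE * the caller must fclose(), or NULL on error. */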
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}

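/* Walk the entries of @cgroup under @controller and build a NULL-terminated
 * array in *list, calling @iterator on each directory (if @directories is
 * true) or regular file (if false) to produce the array elements. */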
static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
				void ***list, size_t typesize,
				void *(*iterator)(const char *, const char *, const char *))
{
	int cfd, fd, ret;
	size_t len;
	char *cg, *tmpc;
	char pathname[MAXPATHLEN];
	size_t sz = 0, asz = 0;
	struct dirent *dirent;
	DIR *dir;

	tmpc = find_mounted_controller(controller, &cfd);
	*list = NULL;
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions. */
	len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
	cg = alloca(len);
	ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
	if (ret < 0 || (size_t)ret >= len) {
		lxcfs_error("Pathname too long under %s\n", cgroup);
		return false;
	}

	fd = openat(cfd, cg, O_DIRECTORY);
	if (fd < 0)
		return false;

	dir = fdopendir(fd);
	if (!dir)
		return false;

	while ((dirent = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(dirent->d_name, ".") ||
		    !strcmp(dirent->d_name, ".."))
			continue;

		ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", cg);
			continue;
		}

		ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (ret) {
			lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
			continue;
		}
		if ((!directories && !S_ISREG(mystat.st_mode)) ||
		    (directories && !S_ISDIR(mystat.st_mode)))
			continue;

		if (sz + 2 >= asz) {
			void **tmp;
			asz += BATCH_SIZE;
			do {
				tmp = realloc(*list, asz * typesize);
			} while (!tmp);
			*list = tmp;
		}
		(*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
		(*list)[sz + 1] = NULL;
		sz++;
	}
	if (closedir(dir) < 0) {
		lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
		return false;
	}
	return true;
}

static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *dup;
	do {
		dup = strdup(dir_entry);
	} while (!dup);
	return dup;
}

bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list, sizeof(*list), &make_children_list_entry);
}

void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free(k->name);
	free(k);
}

void free_keys(struct cgfs_files **keys)
{
	int i;

	if (!keys)
		return;
	for (i = 0; keys[i]; i++) {
		free_key(keys[i]);
	}
	free(keys);
}

bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}

bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return (faccessat(cfd, fnam, F_OK, 0) == 0);
}

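/* Stat @file (or @cgroup itself when @file is NULL) and return a freshly
 * allocated cgfs_files entry carrying its name, uid, gid and mode. Returns
 * NULL if the path cannot be stat()ed; the caller frees via free_key(). */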
struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;
	struct stat sb;
	struct cgfs_files *newkey;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	if (file && *file == '/')
		file++;

	if (file && strchr(file, '/'))
		return NULL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + 3;
	if (file)
		len += strlen(file) + 1;
	fnam = alloca(len);
	snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
		 file ? "/" : "", file ? file : "");

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0)
		return NULL;

	do {
		newkey = malloc(sizeof(struct cgfs_files));
	} while (!newkey);
	if (file)
		newkey->name = must_copy_string(file);
	else if (strrchr(cgroup, '/'))
		newkey->name = must_copy_string(strrchr(cgroup, '/'));
	else
		newkey->name = must_copy_string(cgroup);
	newkey->uid = sb.st_uid;
	newkey->gid = sb.st_gid;
	newkey->mode = sb.st_mode;

	return newkey;
}

static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry) {
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);
	}
	return entry;
}

bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys, sizeof(*keys), &make_key_list_entry);
}

bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd;
	size_t len;
	char *fnam, *tmpc;
	int ret;
	struct stat sb;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	len = strlen(cgroup) + strlen(f) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0 || !S_ISDIR(sb.st_mode))
		return false;

	return true;
}

#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);

/*
 * Clone a task which switches to @task's namespace and writes '1'
 * over a unix socket so we can read the task's reaper's pid in our
 * namespace.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		if (wait_for_pid(pid) < 0)
			_exit(1);
		_exit(0);
	}
}

static int send_creds_clone_wrapper(void *arg)
{
	struct ucred cred;
	char v;
	int sock = *(int *)arg;

	/* we are the child */
	cred.uid = 0;
	cred.gid = 0;
	cred.pid = 1;
	v = '1';
	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
		return 1;
	return 0;
}

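/* Learn the pid (in our namespace) of init in @task's pid namespace by
 * forking a helper that joins that namespace and sends back its credentials
 * over a socketpair. Returns -1 on failure. */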
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}

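/* Return the cached init pid for @qpid's pid namespace, querying and caching
 * it if needed. Returns 0 if it cannot be determined. */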
static pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at the end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}

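/* Reap @pid, retrying on EINTR. Returns 0 if the child exited cleanly with
 * status 0, -1 otherwise. */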
static int wait_for_pid(pid_t pid)
{
	int status, ret;

	if (pid <= 0)
		return -1;

again:
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		if (errno == EINTR)
			goto again;
		return -1;
	}
	if (ret != pid)
		goto again;
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;
	return 0;
}

/*
 * Append pid to *src.
 * src: a pointer to a char* to which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far.
 * pid: the pid to append.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];

	int tmplen = sprintf(tmp, "%d\n", (int)pid);

	if (!*src || tmplen + *sz + 1 >= *asz) {
		char *tmp;
		do {
			tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!tmp);
		*src = tmp;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy((*src) + *sz, tmp, tmplen + 1); /* include the \0 */
	*sz += tmplen;
}

/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,	// base id for a range in the idfile's namespace
		hostuid,	// base id for a range in the caller's namespace
		count;		// number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("uid wraparound at entry %u %u %u in %s\n",
				    nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid), which must be
			 * less than nsuid+count, must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}

/*
 * For is_privileged_over, specify whether we require the calling uid to be
 * root in its namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000.)
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if the caller is not root in its namespace, reject */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into the caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}

static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t r;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		r = S_IROTH;
		break;
	case O_WRONLY:
		r = S_IWOTH;
		break;
	case O_RDWR:
		r = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}
	return ((fmode & r) == r);
}

/*
 * querycg is /a/b/c
 * taskcg is /a/b/c/d/e
 * we return 'd'
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *start, *end;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		start = strdup(taskcg + 1);
	else
		start = strdup(taskcg + strlen(querycg) + 1);
	if (!start)
		return NULL;
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}

static void stripnewline(char *x)
{
	size_t l = strlen(x);
	if (l && x[l - 1] == '\n')
		x[l - 1] = '\0';
}

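/* Parse /proc/@pid/cgroup and return a strdup'd copy of the cgroup path for
 * controller @contrl, or NULL if it cannot be found. */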
static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}

/*
 * Check whether a fuse context may access a cgroup dir or file.
 *
 * If file is not NULL, it is a cgroup file to check under cg.
 * If file is NULL, then we are checking perms on cg itself.
 *
 * For files we can check the mode of the list_keys result.
 * For cgroups, we must make assumptions based on the files under the
 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
 * yet.
 */
static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
{
	struct cgfs_files *k = NULL;
	bool ret = false;

	k = cgfs_get_key(contrl, cg, file);
	if (!k)
		return false;

	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		if (perms_include(k->mode >> 6, mode)) {
			ret = true;
			goto out;
		}
	}
	if (fc->gid == k->gid) {
		if (perms_include(k->mode >> 3, mode)) {
			ret = true;
			goto out;
		}
	}
	ret = perms_include(k->mode, mode);

out:
	free_key(k);
	return ret;
}

#define INITSCOPE "/init.scope"
static void prune_init_slice(char *cg)
{
	char *point;
	size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);

	if (cg_len < initscope_len)
		return;

	point = cg + cg_len - initscope_len;
	if (strcmp(point, INITSCOPE) == 0) {
		if (point == cg)
			*(point + 1) = '\0';
		else
			*point = '\0';
	}
}

/*
 * If pid is in /a/b/c/d, it may only act on things under cg=/a/b/c/d.
 * If pid is in /a, it may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise they pass in a cgroup without a leading '/'.
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}

/*
 * If pid is in /a/b/c, it may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}

/*
 * Given /cgroup/freezer/a/b, return "freezer".
 * The returned char* should NOT be freed.
 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8;
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}

/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *p1;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	p1 = strstr(path + 8, "/");
	if (!p1) {
		errno = EINVAL;
		return NULL;
	}
	errno = 0;
	return p1 + 1;
}

/*
 * Split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last is not.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *p;

	do {
		*dir = strdup(cg);
	} while (!*dir);
	*last = strrchr(cg, '/');
	if (!*last) {
		*last = NULL;
		return;
	}
	p = strrchr(*dir, '/');
	*p = '\0';
}

/*
 * FUSE ops for /cgroup
 */

int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;

	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* Check that path2 is either a child cgroup of cgdir, or listed in
	 * its keys. Then check that the caller's cgroup is under path if
	 * last is a child cgroup, or under cgdir if last is a file. */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* caller is not in this cgroup's ancestry; present a
			 * read-only directory */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// Get uid, gid from the '/tasks' file and make up a mode.
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}

int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}

int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
	       struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show the list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	for (i = 0; list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups
	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}

static void do_release_file_info(struct fuse_file_info *fi)
{
	struct file_info *f = (struct file_info *)fi->fh;

	if (!f)
		return;

	fi->fh = 0;

	free(f->controller);
	f->controller = NULL;
	free(f->cgroup);
	f->cgroup = NULL;
	free(f->file);
	f->file = NULL;
	free(f->buf);
	f->buf = NULL;
	free(f);
	f = NULL;
}

int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}

int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, *cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}

2266 int cg_access(const char *path, int mode)
2267 {
2268 int ret;
2269 const char *cgroup;
2270 char *path1, *path2, *controller;
2271 char *last = NULL, *cgdir = NULL;
2272 struct cgfs_files *k = NULL;
2273 struct fuse_context *fc = fuse_get_context();
2274
2275 if (strcmp(path, "/cgroup") == 0)
2276 return 0;
2277
2278 if (!fc)
2279 return -EIO;
2280
2281 controller = pick_controller_from_path(fc, path);
2282 if (!controller)
2283 return -errno;
2284 cgroup = find_cgroup_in_path(path);
2285 if (!cgroup) {
2286 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2287 if ((mode & W_OK) == 0)
2288 return 0;
2289 return -EACCES;
2290 }
2291
2292 get_cgdir_and_path(cgroup, &cgdir, &last);
2293 if (!last) {
2294 path1 = "/";
2295 path2 = cgdir;
2296 } else {
2297 path1 = cgdir;
2298 path2 = last;
2299 }
2300
2301 k = cgfs_get_key(controller, path1, path2);
2302 if (!k) {
2303 if ((mode & W_OK) == 0)
2304 ret = 0;
2305 else
2306 ret = -EACCES;
2307 goto out;
2308 }
2309 free_key(k);
2310
2311 pid_t initpid = lookup_initpid_in_store(fc->pid);
2312 if (initpid <= 0)
2313 initpid = fc->pid;
2314 if (!caller_may_see_dir(initpid, controller, path1)) {
2315 ret = -ENOENT;
2316 goto out;
2317 }
2318 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2319 ret = -EACCES;
2320 goto out;
2321 }
2322
2323 ret = 0;
2324
2325 out:
2326 free(cgdir);
2327 return ret;
2328 }
2329
2330 int cg_release(const char *path, struct fuse_file_info *fi)
2331 {
2332 do_release_file_info(fi);
2333 return 0;
2334 }
2335
2336 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2337
2338 static bool wait_for_sock(int sock, int timeout)
2339 {
2340 struct epoll_event ev;
2341 int epfd, ret, now, starttime, deltatime, saved_errno;
2342
2343 if ((starttime = time(NULL)) < 0)
2344 return false;
2345
2346 if ((epfd = epoll_create(1)) < 0) {
2347 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2348 return false;
2349 }
2350
2351 ev.events = POLLIN_SET;
2352 ev.data.fd = sock;
2353 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2354 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2355 close(epfd);
2356 return false;
2357 }
2358
2359 again:
2360 if ((now = time(NULL)) < 0) {
2361 close(epfd);
2362 return false;
2363 }
2364
2365 deltatime = (starttime + timeout) - now;
2366 if (deltatime < 0) { // timeout
2367 errno = 0;
2368 close(epfd);
2369 return false;
2370 }
2371 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2372 if (ret < 0 && errno == EINTR)
2373 goto again;
2374 saved_errno = errno;
2375 close(epfd);
2376
2377 if (ret <= 0) {
2378 errno = saved_errno;
2379 return false;
2380 }
2381 return true;
2382 }
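/*
 * Minimal usage sketch for wait_for_sock(), kept under #if 0 so it is
 * never compiled; `fd` and `val` are hypothetical names. This is the
 * pattern the helpers below use to bound a blocking read by a deadline
 * given in seconds.
 */
#if 0
if (!wait_for_sock(fd, 2)) {
lxcfs_error("Timed out waiting for peer: %s.\n", strerror(errno));
return false;
}
if (read(fd, &val, sizeof(val)) != sizeof(val))
return false;
#endif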
2383
2384 static int msgrecv(int sockfd, void *buf, size_t len)
2385 {
2386 if (!wait_for_sock(sockfd, 2))
2387 return -1;
2388 return recv(sockfd, buf, len, MSG_DONTWAIT);
2389 }
2390
2391 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2392 {
2393 struct msghdr msg = { 0 };
2394 struct iovec iov;
2395 struct cmsghdr *cmsg;
2396 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2397 char buf[1];
2398 buf[0] = 'p';
2399
2400 if (pingfirst) {
2401 if (msgrecv(sock, buf, 1) != 1) {
2402 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2403 return SEND_CREDS_FAIL;
2404 }
2405 }
2406
2407 msg.msg_control = cmsgbuf;
2408 msg.msg_controllen = sizeof(cmsgbuf);
2409
2410 cmsg = CMSG_FIRSTHDR(&msg);
2411 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2412 cmsg->cmsg_level = SOL_SOCKET;
2413 cmsg->cmsg_type = SCM_CREDENTIALS;
2414 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2415
2416 msg.msg_name = NULL;
2417 msg.msg_namelen = 0;
2418
2419 buf[0] = v;
2420 iov.iov_base = buf;
2421 iov.iov_len = sizeof(buf);
2422 msg.msg_iov = &iov;
2423 msg.msg_iovlen = 1;
2424
2425 if (sendmsg(sock, &msg, 0) < 0) {
2426 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2427 if (errno == ESRCH)
2428 return SEND_CREDS_NOTSK;
2429 return SEND_CREDS_FAIL;
2430 }
2431
2432 return SEND_CREDS_OK;
2433 }
2434
2435 static bool recv_creds(int sock, struct ucred *cred, char *v)
2436 {
2437 struct msghdr msg = { 0 };
2438 struct iovec iov;
2439 struct cmsghdr *cmsg;
2440 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2441 char buf[1];
2442 int ret;
2443 int optval = 1;
2444
2445 *v = '1';
2446
2447 cred->pid = -1;
2448 cred->uid = -1;
2449 cred->gid = -1;
2450
2451 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2452 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2453 return false;
2454 }
2455 buf[0] = '1';
2456 if (write(sock, buf, 1) != 1) {
2457 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2458 return false;
2459 }
2460
2461 msg.msg_name = NULL;
2462 msg.msg_namelen = 0;
2463 msg.msg_control = cmsgbuf;
2464 msg.msg_controllen = sizeof(cmsgbuf);
2465
2466 iov.iov_base = buf;
2467 iov.iov_len = sizeof(buf);
2468 msg.msg_iov = &iov;
2469 msg.msg_iovlen = 1;
2470
2471 if (!wait_for_sock(sock, 2)) {
2472 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2473 return false;
2474 }
2475 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2476 if (ret < 0) {
2477 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2478 return false;
2479 }
2480
2481 cmsg = CMSG_FIRSTHDR(&msg);
2482
2483 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2484 cmsg->cmsg_level == SOL_SOCKET &&
2485 cmsg->cmsg_type == SCM_CREDENTIALS) {
2486 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2487 }
2488 *v = buf[0];
2489
2490 return true;
2491 }
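/*
 * How send_creds() and recv_creds() pair over a socketpair, as a sketch
 * (never compiled; the two halves run in different processes, and
 * `some_pid` is a hypothetical name). The receiver enables SO_PASSCRED
 * and pings first; the sender waits for that ping, then sends one byte
 * with SCM_CREDENTIALS attached, which the kernel translates into the
 * receiver's pid namespace.
 */
#if 0
int sv[2];
socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);

/* receiver process: */
struct ucred seen; char v;
recv_creds(sv[1], &seen, &v); /* seen.pid arrives in the receiver's pidns */

/* sender process: */
struct ucred cred = { .pid = some_pid, .uid = 0, .gid = 0 };
send_creds(sv[0], &cred, '0', true); /* pingfirst: wait for the receiver's ping */
#endif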
2492
2493 struct pid_ns_clone_args {
2494 int *cpipe;
2495 int sock;
2496 pid_t tpid;
2497 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2498 };
2499
2500 /*
2501 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2502 * with clone(). This simply writes '1' as ACK back to the parent
2503 * before calling the actual wrapped function.
2504 */
2505 static int pid_ns_clone_wrapper(void *arg) {
2506 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2507 char b = '1';
2508
2509 close(args->cpipe[0]);
2510 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2511 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2512 close(args->cpipe[1]);
2513 return args->wrapped(args->sock, args->tpid);
2514 }
2515
2516 /*
2517 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2518 * int value back over the socket. This shifts the pid from the
2519 * sender's pidns into tpid's pidns.
2520 */
2521 static int pid_to_ns(int sock, pid_t tpid)
2522 {
2523 char v = '0';
2524 struct ucred cred;
2525
2526 while (recv_creds(sock, &cred, &v)) {
2527 if (v == '1')
2528 return 0;
2529 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2530 return 1;
2531 }
2532 return 0;
2533 }
2534
2535
2536 /*
2537 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2538 * in your old pidns. Only children which you clone will be in the target
2539 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2540 * actually convert pids.
2541 *
2542 * Note: glibc's fork() does not respect pidns, which can lead to failed
2543 * assertions inside glibc (and thus failed forks) if the child's pid in
2544 * the pidns and the parent pid outside are identical. Using clone prevents
2545 * this issue.
2546 */
2547 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2548 {
2549 int newnsfd = -1, ret, cpipe[2];
2550 char fnam[100];
2551 pid_t cpid;
2552 char v;
2553
2554 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2555 if (ret < 0 || ret >= sizeof(fnam))
2556 _exit(1);
2557 newnsfd = open(fnam, O_RDONLY);
2558 if (newnsfd < 0)
2559 _exit(1);
2560 if (setns(newnsfd, 0) < 0)
2561 _exit(1);
2562 close(newnsfd);
2563
2564 if (pipe(cpipe) < 0)
2565 _exit(1);
2566
2567 struct pid_ns_clone_args args = {
2568 .cpipe = cpipe,
2569 .sock = sock,
2570 .tpid = tpid,
2571 .wrapped = &pid_to_ns
2572 };
2573 size_t stack_size = sysconf(_SC_PAGESIZE);
2574 void *stack = alloca(stack_size);
2575
2576 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2577 if (cpid < 0)
2578 _exit(1);
2579
2580 // give the child 1 second to be done forking and
2581 // write its ack
2582 if (!wait_for_sock(cpipe[0], 1))
2583 _exit(1);
2584 ret = read(cpipe[0], &v, 1);
2585 if (ret != sizeof(char) || v != '1')
2586 _exit(1);
2587
2588 if (!wait_for_pid(cpid))
2589 _exit(1);
2590 _exit(0);
2591 }
2592
2593 /*
2594 * To read pid-valued cgroup files for a given task, we read the file
2595 * here, then have a forked child setns into that task's pidns and clone()
2596 * a child - the first process truly in the new ns - to translate each pid back over a socketpair.
2597 */
2598 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2599 {
2600 int sock[2] = {-1, -1};
2601 char *tmpdata = NULL;
2602 int ret;
2603 pid_t qpid, cpid = -1;
2604 bool answer = false;
2605 char v = '0';
2606 struct ucred cred;
2607 size_t sz = 0, asz = 0;
2608
2609 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2610 return false;
2611
2612 /*
2613 * Now we read the pids from returned data one by one, pass
2614 * them into a child in the target namespace, read back the
2615 * translated pids, and put them into our to-return data
2616 */
2617
2618 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2619 perror("socketpair");
2620 free(tmpdata);
2621 return false;
2622 }
2623
2624 cpid = fork();
2625 if (cpid == -1)
2626 goto out;
2627
2628 if (!cpid) // child - exits when done
2629 pid_to_ns_wrapper(sock[1], tpid);
2630
2631 char *ptr = tmpdata;
2632 cred.uid = 0;
2633 cred.gid = 0;
2634 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2635 cred.pid = qpid;
2636 ret = send_creds(sock[0], &cred, v, true);
2637
2638 if (ret == SEND_CREDS_NOTSK)
2639 goto next;
2640 if (ret == SEND_CREDS_FAIL)
2641 goto out;
2642
2643 // read converted results
2644 if (!wait_for_sock(sock[0], 2)) {
2645 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2646 goto out;
2647 }
2648 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2649 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2650 goto out;
2651 }
2652 must_strcat_pid(d, &sz, &asz, qpid);
2653 next:
2654 ptr = strchr(ptr, '\n');
2655 if (!ptr)
2656 break;
2657 ptr++;
2658 }
2659
2660 cred.pid = getpid();
2661 v = '1';
2662 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2663 // failed to ask child to exit
2664 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2665 goto out;
2666 }
2667
2668 answer = true;
2669
2670 out:
2671 free(tmpdata);
2672 if (cpid != -1)
2673 wait_for_pid(cpid);
2674 if (sock[0] != -1) {
2675 close(sock[0]);
2676 close(sock[1]);
2677 }
2678 return answer;
2679 }
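/*
 * Hypothetical caller of do_read_pids(), as a sketch (never compiled;
 * the controller and cgroup names are made up): translate the pids in
 * a container's "tasks" file into the requesting task's pid namespace.
 */
#if 0
char *data = NULL;
if (do_read_pids(fc->pid, "freezer", "/lxc/c1", "tasks", &data)) {
fputs(data, stdout);
free(data);
}
#endif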
2680
2681 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2682 struct fuse_file_info *fi)
2683 {
2684 struct fuse_context *fc = fuse_get_context();
2685 struct file_info *f = (struct file_info *)fi->fh;
2686 struct cgfs_files *k = NULL;
2687 char *data = NULL;
2688 int ret, s;
2689 bool r;
2690
2691 if (f->type != LXC_TYPE_CGFILE) {
2692 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2693 return -EIO;
2694 }
2695
2696 if (offset)
2697 return 0;
2698
2699 if (!fc)
2700 return -EIO;
2701
2702 if (!f->controller)
2703 return -EINVAL;
2704
2705 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2706 return -EINVAL;
2707 }
2708 free_key(k);
2709
2710
2711 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2712 ret = -EACCES;
2713 goto out;
2714 }
2715
2716 if (strcmp(f->file, "tasks") == 0 ||
2717 strcmp(f->file, "/tasks") == 0 ||
2718 strcmp(f->file, "/cgroup.procs") == 0 ||
2719 strcmp(f->file, "cgroup.procs") == 0)
2720 // special case - we have to translate the pids
2721 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2722 else
2723 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2724
2725 if (!r) {
2726 ret = -EINVAL;
2727 goto out;
2728 }
2729
2730 if (!data) {
2731 ret = 0;
2732 goto out;
2733 }
2734 s = strlen(data);
2735 if (s > size)
2736 s = size;
2737 memcpy(buf, data, s);
2738 if (s > 0 && s < size && data[s-1] != '\n')
2739 buf[s++] = '\n';
2740
2741 ret = s;
2742
2743 out:
2744 free(data);
2745 return ret;
2746 }
2747
2748 static int pid_from_ns(int sock, pid_t tpid)
2749 {
2750 pid_t vpid;
2751 struct ucred cred;
2752 char v;
2753 int ret;
2754
2755 cred.uid = 0;
2756 cred.gid = 0;
2757 while (1) {
2758 if (!wait_for_sock(sock, 2)) {
2759 lxcfs_error("%s\n", "Timeout reading from parent.");
2760 return 1;
2761 }
2762 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2763 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2764 return 1;
2765 }
2766 if (vpid == -1) // done
2767 break;
2768 v = '0';
2769 cred.pid = vpid;
2770 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2771 v = '1';
2772 cred.pid = getpid();
2773 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2774 return 1;
2775 }
2776 }
2777 return 0;
2778 }
2779
2780 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2781 {
2782 int newnsfd = -1, ret, cpipe[2];
2783 char fnam[100];
2784 pid_t cpid;
2785 char v;
2786
2787 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2788 if (ret < 0 || ret >= sizeof(fnam))
2789 _exit(1);
2790 newnsfd = open(fnam, O_RDONLY);
2791 if (newnsfd < 0)
2792 _exit(1);
2793 if (setns(newnsfd, 0) < 0)
2794 _exit(1);
2795 close(newnsfd);
2796
2797 if (pipe(cpipe) < 0)
2798 _exit(1);
2799
2800 struct pid_ns_clone_args args = {
2801 .cpipe = cpipe,
2802 .sock = sock,
2803 .tpid = tpid,
2804 .wrapped = &pid_from_ns
2805 };
2806 size_t stack_size = sysconf(_SC_PAGESIZE);
2807 void *stack = alloca(stack_size);
2808
2809 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2810 if (cpid < 0)
2811 _exit(1);
2812
2813 // give the child 1 second to be done forking and
2814 // write its ack
2815 if (!wait_for_sock(cpipe[0], 1))
2816 _exit(1);
2817 ret = read(cpipe[0], &v, 1);
2818 if (ret != sizeof(char) || v != '1')
2819 _exit(1);
2820
2821 if (!wait_for_pid(cpid))
2822 _exit(1);
2823 _exit(0);
2824 }
2825
2826 /*
2827 * Given host @uid, return the uid to which it maps in
2828 * @pid's user namespace, or -1 if none.
2829 */
2830 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2831 {
2832 FILE *f;
2833 char line[400];
2834
2835 sprintf(line, "/proc/%d/uid_map", pid);
2836 if ((f = fopen(line, "r")) == NULL) {
2837 return false;
2838 }
2839
2840 *answer = convert_id_to_ns(f, uid);
2841 fclose(f);
2842
2843 if (*answer == -1)
2844 return false;
2845 return true;
2846 }
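/*
 * Sketch of the uid_map translation that convert_id_to_ns() (defined
 * elsewhere in this file) performs; never compiled, and map_host_uid is
 * a made-up name. Each /proc/<pid>/uid_map line reads
 * "ns_id host_id range", and a host uid maps into the namespace iff it
 * falls inside [host_id, host_id + range).
 */
#if 0
static uid_t map_host_uid(FILE *f, uid_t host)
{
uid_t ns_id, host_id, range;

while (fscanf(f, "%u %u %u", &ns_id, &host_id, &range) == 3)
if (host >= host_id && host < host_id + range)
return ns_id + (host - host_id);
return (uid_t)-1;
}
#endif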
2847
2848 /*
2849 * get_pid_creds: get the real uid and gid of @pid from
2850 * /proc/<pid>/status
2851 * (XXX should we use euid here?)
2852 */
2853 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2854 {
2855 char line[400];
2856 uid_t u;
2857 gid_t g;
2858 FILE *f;
2859
2860 *uid = -1;
2861 *gid = -1;
2862 sprintf(line, "/proc/%d/status", pid);
2863 if ((f = fopen(line, "r")) == NULL) {
2864 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2865 return;
2866 }
2867 while (fgets(line, 400, f)) {
2868 if (strncmp(line, "Uid:", 4) == 0) {
2869 if (sscanf(line+4, "%u", &u) != 1) {
2870 lxcfs_error("bad uid line for pid %u\n", pid);
2871 fclose(f);
2872 return;
2873 }
2874 *uid = u;
2875 } else if (strncmp(line, "Gid:", 4) == 0) {
2876 if (sscanf(line+4, "%u", &g) != 1) {
2877 lxcfs_error("bad gid line for pid %u\n", pid);
2878 fclose(f);
2879 return;
2880 }
2881 *gid = g;
2882 }
2883 }
2884 fclose(f);
2885 }
2886
2887 /*
2888 * May the requestor @r move victim @v to a new cgroup?
2889 * This is allowed if
2890 * . they are the same task
2891 * . they are owned by the same uid
2892 * . @r is root on the host, or
2893 * . @v's uid is mapped into @r's where @r is root.
2894 */
2895 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2896 {
2897 uid_t v_uid, tmpuid;
2898 gid_t v_gid;
2899
2900 if (r == v)
2901 return true;
2902 if (r_uid == 0)
2903 return true;
2904 get_pid_creds(v, &v_uid, &v_gid);
2905 if (r_uid == v_uid)
2906 return true;
2907 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2908 && hostuid_to_ns(v_uid, r, &tmpuid))
2909 return true;
2910 return false;
2911 }
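/*
 * Example: uid 1000 may move its own tasks; host root may move anything;
 * and a container root whose user namespace maps both its own uid and
 * the victim's uid may move the victim even though neither task is
 * owned by host root.
 */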
2912
2913 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2914 const char *file, const char *buf)
2915 {
2916 int sock[2] = {-1, -1};
2917 pid_t qpid, cpid = -1;
2918 FILE *pids_file = NULL;
2919 bool answer = false, fail = false;
2920
2921 pids_file = open_pids_file(contrl, cg);
2922 if (!pids_file)
2923 return false;
2924
2925 /*
2926 * write the pids to a socket, have helper in writer's pidns
2927 * call movepid for us
2928 */
2929 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2930 perror("socketpair");
2931 goto out;
2932 }
2933
2934 cpid = fork();
2935 if (cpid == -1)
2936 goto out;
2937
2938 if (!cpid) { // child
2939 fclose(pids_file);
2940 pid_from_ns_wrapper(sock[1], tpid);
2941 }
2942
2943 const char *ptr = buf;
2944 while (sscanf(ptr, "%d", &qpid) == 1) {
2945 struct ucred cred;
2946 char v;
2947
2948 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2949 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2950 goto out;
2951 }
2952
2953 if (recv_creds(sock[0], &cred, &v)) {
2954 if (v == '0') {
2955 if (!may_move_pid(tpid, tuid, cred.pid)) {
2956 fail = true;
2957 break;
2958 }
2959 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2960 fail = true;
2961 }
2962 }
2963
2964 ptr = strchr(ptr, '\n');
2965 if (!ptr)
2966 break;
2967 ptr++;
2968 }
2969
2970 /* Done; send the -1 sentinel to tell the child to exit. */
2971 qpid = -1;
2972 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid))
2973 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2974
2975 if (!fail)
2976 answer = true;
2977
2978 out:
2979 if (cpid != -1)
2980 wait_for_pid(cpid);
2981 if (sock[0] != -1) {
2982 close(sock[0]);
2983 close(sock[1]);
2984 }
2985 if (pids_file) {
2986 if (fclose(pids_file) != 0)
2987 answer = false;
2988 }
2989 return answer;
2990 }
2991
2992 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2993 struct fuse_file_info *fi)
2994 {
2995 struct fuse_context *fc = fuse_get_context();
2996 char *localbuf = NULL;
2997 struct cgfs_files *k = NULL;
2998 struct file_info *f = (struct file_info *)fi->fh;
2999 bool r;
3000
3001 if (f->type != LXC_TYPE_CGFILE) {
3002 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3003 return -EIO;
3004 }
3005
3006 if (offset)
3007 return 0;
3008
3009 if (!fc)
3010 return -EIO;
3011
3012 localbuf = alloca(size+1);
3013 localbuf[size] = '\0';
3014 memcpy(localbuf, buf, size);
3015
3016 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3017 size = -EINVAL;
3018 goto out;
3019 }
3020
3021 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3022 size = -EACCES;
3023 goto out;
3024 }
3025
3026 if (strcmp(f->file, "tasks") == 0 ||
3027 strcmp(f->file, "/tasks") == 0 ||
3028 strcmp(f->file, "/cgroup.procs") == 0 ||
3029 strcmp(f->file, "cgroup.procs") == 0)
3030 // special case - we have to translate the pids
3031 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3032 else
3033 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3034
3035 if (!r)
3036 size = -EINVAL;
3037
3038 out:
3039 free_key(k);
3040 return size;
3041 }
3042
3043 int cg_chown(const char *path, uid_t uid, gid_t gid)
3044 {
3045 struct fuse_context *fc = fuse_get_context();
3046 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3047 struct cgfs_files *k = NULL;
3048 const char *cgroup;
3049 int ret;
3050
3051 if (!fc)
3052 return -EIO;
3053
3054 if (strcmp(path, "/cgroup") == 0)
3055 return -EPERM;
3056
3057 controller = pick_controller_from_path(fc, path);
3058 if (!controller)
3059 return errno == ENOENT ? -EPERM : -errno;
3060
3061 cgroup = find_cgroup_in_path(path);
3062 if (!cgroup)
3063 /* this is just /cgroup/controller */
3064 return -EPERM;
3065
3066 get_cgdir_and_path(cgroup, &cgdir, &last);
3067
3068 if (!last) {
3069 path1 = "/";
3070 path2 = cgdir;
3071 } else {
3072 path1 = cgdir;
3073 path2 = last;
3074 }
3075
3076 if (is_child_cgroup(controller, path1, path2)) {
3077 // get uid and gid from the 'tasks' file and make up a mode.
3078 // That is a hack until cgmanager gains a GetCgroupPerms fn.
3079 k = cgfs_get_key(controller, cgroup, "tasks");
3080
3081 } else
3082 k = cgfs_get_key(controller, path1, path2);
3083
3084 if (!k) {
3085 ret = -EINVAL;
3086 goto out;
3087 }
3088
3089 /*
3090 * This being a fuse request, the uid and gid must be valid
3091 * in the caller's namespace. So we can just check to make
3092 * sure that the caller is root in his uid, and privileged
3093 * over the file's current owner.
3094 */
3095 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3096 ret = -EACCES;
3097 goto out;
3098 }
3099
3100 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3101
3102 out:
3103 free_key(k);
3104 free(cgdir);
3105
3106 return ret;
3107 }
3108
3109 int cg_chmod(const char *path, mode_t mode)
3110 {
3111 struct fuse_context *fc = fuse_get_context();
3112 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3113 struct cgfs_files *k = NULL;
3114 const char *cgroup;
3115 int ret;
3116
3117 if (!fc)
3118 return -EIO;
3119
3120 if (strcmp(path, "/cgroup") == 0)
3121 return -EPERM;
3122
3123 controller = pick_controller_from_path(fc, path);
3124 if (!controller)
3125 return errno == ENOENT ? -EPERM : -errno;
3126
3127 cgroup = find_cgroup_in_path(path);
3128 if (!cgroup)
3129 /* this is just /cgroup/controller */
3130 return -EPERM;
3131
3132 get_cgdir_and_path(cgroup, &cgdir, &last);
3133
3134 if (!last) {
3135 path1 = "/";
3136 path2 = cgdir;
3137 } else {
3138 path1 = cgdir;
3139 path2 = last;
3140 }
3141
3142 if (is_child_cgroup(controller, path1, path2)) {
3143 // get uid and gid from the 'tasks' file and make up a mode.
3144 // That is a hack until cgmanager gains a GetCgroupPerms fn.
3145 k = cgfs_get_key(controller, cgroup, "tasks");
3146
3147 } else
3148 k = cgfs_get_key(controller, path1, path2);
3149
3150 if (!k) {
3151 ret = -EINVAL;
3152 goto out;
3153 }
3154
3155 /*
3156 * This being a fuse request, the uid and gid must be valid
3157 * in the caller's namespace. So we can just check to make
3158 * sure that the caller is root in his uid, and privileged
3159 * over the file's current owner.
3160 */
3161 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3162 ret = -EPERM;
3163 goto out;
3164 }
3165
3166 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3167 ret = -EINVAL;
3168 goto out;
3169 }
3170
3171 ret = 0;
3172 out:
3173 free_key(k);
3174 free(cgdir);
3175 return ret;
3176 }
3177
3178 int cg_mkdir(const char *path, mode_t mode)
3179 {
3180 struct fuse_context *fc = fuse_get_context();
3181 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3182 const char *cgroup;
3183 int ret;
3184
3185 if (!fc)
3186 return -EIO;
3187
3188 controller = pick_controller_from_path(fc, path);
3189 if (!controller)
3190 return errno == ENOENT ? -EPERM : -errno;
3191
3192 cgroup = find_cgroup_in_path(path);
3193 if (!cgroup)
3194 return -errno;
3195
3196 get_cgdir_and_path(cgroup, &cgdir, &last);
3197 if (!last)
3198 path1 = "/";
3199 else
3200 path1 = cgdir;
3201
3202 pid_t initpid = lookup_initpid_in_store(fc->pid);
3203 if (initpid <= 0)
3204 initpid = fc->pid;
3205 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3206 if (!next)
3207 ret = -EINVAL;
3208 else if (last && strcmp(next, last) == 0)
3209 ret = -EEXIST;
3210 else
3211 ret = -EPERM;
3212 goto out;
3213 }
3214
3215 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3216 ret = -EACCES;
3217 goto out;
3218 }
3219 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3220 ret = -EACCES;
3221 goto out;
3222 }
3223
3224 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3225
3226 out:
3227 free(cgdir);
3228 free(next);
3229 return ret;
3230 }
3231
3232 int cg_rmdir(const char *path)
3233 {
3234 struct fuse_context *fc = fuse_get_context();
3235 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3236 const char *cgroup;
3237 int ret;
3238
3239 if (!fc)
3240 return -EIO;
3241
3242 controller = pick_controller_from_path(fc, path);
3243 if (!controller) /* Someone's trying to delete "/cgroup". */
3244 return -EPERM;
3245
3246 cgroup = find_cgroup_in_path(path);
3247 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3248 return -EPERM;
3249
3250 get_cgdir_and_path(cgroup, &cgdir, &last);
3251 if (!last) {
3252 /* Someone's trying to delete a cgroup on the same level as the
3253 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3254 * rmdir "/cgroup/blkio/init.slice".
3255 */
3256 ret = -EPERM;
3257 goto out;
3258 }
3259
3260 pid_t initpid = lookup_initpid_in_store(fc->pid);
3261 if (initpid <= 0)
3262 initpid = fc->pid;
3263 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3264 if (!last || (next && (strcmp(next, last) == 0)))
3265 ret = -EBUSY;
3266 else
3267 ret = -ENOENT;
3268 goto out;
3269 }
3270
3271 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3272 ret = -EACCES;
3273 goto out;
3274 }
3275 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3276 ret = -EACCES;
3277 goto out;
3278 }
3279
3280 if (!cgfs_remove(controller, cgroup)) {
3281 ret = -EINVAL;
3282 goto out;
3283 }
3284
3285 ret = 0;
3286
3287 out:
3288 free(cgdir);
3289 free(next);
3290 return ret;
3291 }
3292
3293 static bool startswith(const char *line, const char *pref)
3294 {
3295 if (strncmp(line, pref, strlen(pref)) == 0)
3296 return true;
3297 return false;
3298 }
3299
3300 static void parse_memstat(char *memstat, unsigned long *cached,
3301 unsigned long *active_anon, unsigned long *inactive_anon,
3302 unsigned long *active_file, unsigned long *inactive_file,
3303 unsigned long *unevictable, unsigned long *shmem)
3304 {
3305 char *eol;
3306
3307 while (*memstat) {
3308 if (startswith(memstat, "total_cache")) {
3309 sscanf(memstat + 11, "%lu", cached);
3310 *cached /= 1024;
3311 } else if (startswith(memstat, "total_active_anon")) {
3312 sscanf(memstat + 17, "%lu", active_anon);
3313 *active_anon /= 1024;
3314 } else if (startswith(memstat, "total_inactive_anon")) {
3315 sscanf(memstat + 19, "%lu", inactive_anon);
3316 *inactive_anon /= 1024;
3317 } else if (startswith(memstat, "total_active_file")) {
3318 sscanf(memstat + 17, "%lu", active_file);
3319 *active_file /= 1024;
3320 } else if (startswith(memstat, "total_inactive_file")) {
3321 sscanf(memstat + 19, "%lu", inactive_file);
3322 *inactive_file /= 1024;
3323 } else if (startswith(memstat, "total_unevictable")) {
3324 sscanf(memstat + 17, "%lu", unevictable);
3325 *unevictable /= 1024;
3326 } else if (startswith(memstat, "total_shmem")) {
3327 sscanf(memstat + 11, "%lu", shmem);
3328 *shmem /= 1024;
3329 }
3330 eol = strchr(memstat, '\n');
3331 if (!eol)
3332 return;
3333 memstat = eol+1;
3334 }
3335 }
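/*
 * Worked example for parse_memstat(), never compiled: memory.stat
 * carries byte counters which are converted to kB above, so this input
 * yields cached == 1024 and a_anon == 2048.
 */
#if 0
char stat[] = "total_cache 1048576\ntotal_active_anon 2097152\n";
unsigned long cached = 0, a_anon = 0, i_anon = 0, a_file = 0, i_file = 0, unevict = 0, shm = 0;
parse_memstat(stat, &cached, &a_anon, &i_anon, &a_file, &i_file, &unevict, &shm);
#endif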
3336
3337 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3338 {
3339 char *eol;
3340 char key[32];
3341
3342 memset(key, 0, 32);
3343 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3344
3345 size_t len = strlen(key);
3346 *v = 0;
3347
3348 while (*str) {
3349 if (startswith(str, key)) {
3350 sscanf(str + len, "%lu", v);
3351 return;
3352 }
3353 eol = strchr(str, '\n');
3354 if (!eol)
3355 return;
3356 str = eol+1;
3357 }
3358 }
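/*
 * Sketch for get_blkio_io_value(), never compiled; `str` is assumed to
 * hold the contents of e.g. blkio.io_service_bytes, whose per-device
 * lines look like "8:0 Read 4096". This extracts the Read counter for
 * device 8:0.
 */
#if 0
unsigned long rd = 0;
get_blkio_io_value(str, 8, 0, "Read", &rd);
#endif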
3359
3360 static int read_file(const char *path, char *buf, size_t size,
3361 struct file_info *d)
3362 {
3363 size_t linelen = 0, total_len = 0, rv = 0;
3364 char *line = NULL;
3365 char *cache = d->buf;
3366 size_t cache_size = d->buflen;
3367 FILE *f = fopen(path, "r");
3368 if (!f)
3369 return 0;
3370
3371 while (getline(&line, &linelen, f) != -1) {
3372 ssize_t l = snprintf(cache, cache_size, "%s", line);
3373 if (l < 0) {
3374 perror("Error writing to cache");
3375 rv = 0;
3376 goto err;
3377 }
3378 if (l >= cache_size) {
3379 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3380 rv = 0;
3381 goto err;
3382 }
3383 cache += l;
3384 cache_size -= l;
3385 total_len += l;
3386 }
3387
3388 d->size = total_len;
3389 if (total_len > size)
3390 total_len = size;
3391
3392 /* read from off 0 */
3393 memcpy(buf, d->buf, total_len);
3394 rv = total_len;
3395 err:
3396 fclose(f);
3397 free(line);
3398 return rv;
3399 }
3400
3401 /*
3402 * FUSE ops for /proc
3403 */
3404
3405 static unsigned long get_memlimit(const char *cgroup, const char *file)
3406 {
3407 char *memlimit_str = NULL;
3408 unsigned long memlimit = -1;
3409
3410 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3411 memlimit = strtoul(memlimit_str, NULL, 10);
3412
3413 free(memlimit_str);
3414
3415 return memlimit;
3416 }
3417
3418 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3419 {
3420 char *copy = strdupa(cgroup);
3421 unsigned long memlimit = 0, retlimit;
3422
3423 retlimit = get_memlimit(copy, file);
3424
3425 while (strcmp(copy, "/") != 0) {
3426 copy = dirname(copy);
3427 memlimit = get_memlimit(copy, file);
3428 if (memlimit != -1 && memlimit < retlimit)
3429 retlimit = memlimit;
3430 }
3431
3432 return retlimit;
3433 }
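/*
 * Example: for cg == "/lxc/c1" the limit is read at "/lxc/c1", then
 * "/lxc", then "/", and the smallest value wins, so a tighter
 * memory.limit_in_bytes on a parent cgroup caps the child.
 */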
3434
3435 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3436 struct fuse_file_info *fi)
3437 {
3438 struct fuse_context *fc = fuse_get_context();
3439 struct file_info *d = (struct file_info *)fi->fh;
3440 char *cg;
3441 char *memusage_str = NULL, *memstat_str = NULL,
3442 *memswlimit_str = NULL, *memswusage_str = NULL;
3443 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3444 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3445 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3446 hostswtotal = 0;
3447 char *line = NULL;
3448 size_t linelen = 0, total_len = 0, rv = 0;
3449 char *cache = d->buf;
3450 size_t cache_size = d->buflen;
3451 FILE *f = NULL;
3452
3453 if (offset){
3454 if (offset > d->size)
3455 return -EINVAL;
3456 if (!d->cached)
3457 return 0;
3458 int left = d->size - offset;
3459 total_len = left > size ? size: left;
3460 memcpy(buf, cache + offset, total_len);
3461 return total_len;
3462 }
3463
3464 pid_t initpid = lookup_initpid_in_store(fc->pid);
3465 if (initpid <= 0)
3466 initpid = fc->pid;
3467 cg = get_pid_cgroup(initpid, "memory");
3468 if (!cg)
3469 return read_file("/proc/meminfo", buf, size, d);
3470 prune_init_slice(cg);
3471
3472 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3473 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3474 goto err;
3475 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3476 goto err;
3477
3478 // The following values are allowed to fail, because swap accounting
3479 // may be disabled in the current kernel.
3480 if (cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3481 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3482 {
3483 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3484 memswusage = strtoul(memswusage_str, NULL, 10);
3485
3486 memswlimit = memswlimit / 1024;
3487 memswusage = memswusage / 1024;
3488 }
3489
3490 memusage = strtoul(memusage_str, NULL, 10);
3491 memlimit /= 1024;
3492 memusage /= 1024;
3493
3494 parse_memstat(memstat_str, &cached, &active_anon,
3495 &inactive_anon, &active_file, &inactive_file,
3496 &unevictable, &shmem);
3497
3498 f = fopen("/proc/meminfo", "r");
3499 if (!f)
3500 goto err;
3501
3502 while (getline(&line, &linelen, f) != -1) {
3503 ssize_t l;
3504 char *printme, lbuf[100];
3505
3506 memset(lbuf, 0, 100);
3507 if (startswith(line, "MemTotal:")) {
3508 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3509 if (hosttotal < memlimit)
3510 memlimit = hosttotal;
3511 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3512 printme = lbuf;
3513 } else if (startswith(line, "MemFree:")) {
3514 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3515 printme = lbuf;
3516 } else if (startswith(line, "MemAvailable:")) {
3517 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3518 printme = lbuf;
3519 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3520 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3521 if (hostswtotal < memswlimit)
3522 memswlimit = hostswtotal;
3523 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3524 printme = lbuf;
3525 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3526 unsigned long swaptotal = memswlimit,
3527 swapusage = memswusage - memusage,
3528 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3529 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3530 printme = lbuf;
3531 } else if (startswith(line, "Slab:")) {
3532 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3533 printme = lbuf;
3534 } else if (startswith(line, "Buffers:")) {
3535 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3536 printme = lbuf;
3537 } else if (startswith(line, "Cached:")) {
3538 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3539 printme = lbuf;
3540 } else if (startswith(line, "SwapCached:")) {
3541 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3542 printme = lbuf;
3543 } else if (startswith(line, "Active:")) {
3544 snprintf(lbuf, 100, "Active: %8lu kB\n",
3545 active_anon + active_file);
3546 printme = lbuf;
3547 } else if (startswith(line, "Inactive:")) {
3548 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3549 inactive_anon + inactive_file);
3550 printme = lbuf;
3551 } else if (startswith(line, "Active(anon)")) {
3552 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3553 printme = lbuf;
3554 } else if (startswith(line, "Inactive(anon)")) {
3555 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3556 printme = lbuf;
3557 } else if (startswith(line, "Active(file)")) {
3558 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3559 printme = lbuf;
3560 } else if (startswith(line, "Inactive(file)")) {
3561 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3562 printme = lbuf;
3563 } else if (startswith(line, "Unevictable")) {
3564 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3565 printme = lbuf;
3566 } else if (startswith(line, "SReclaimable")) {
3567 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3568 printme = lbuf;
3569 } else if (startswith(line, "SUnreclaim")) {
3570 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3571 printme = lbuf;
3572 } else if (startswith(line, "Shmem:")) {
3573 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3574 printme = lbuf;
3575 } else if (startswith(line, "ShmemHugePages")) {
3576 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3577 printme = lbuf;
3578 } else if (startswith(line, "ShmemPmdMapped")) {
3579 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3580 printme = lbuf;
3581 } else
3582 printme = line;
3583
3584 l = snprintf(cache, cache_size, "%s", printme);
3585 if (l < 0) {
3586 perror("Error writing to cache");
3587 rv = 0;
3588 goto err;
3589
3590 }
3591 if (l >= cache_size) {
3592 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3593 rv = 0;
3594 goto err;
3595 }
3596
3597 cache += l;
3598 cache_size -= l;
3599 total_len += l;
3600 }
3601
3602 d->cached = 1;
3603 d->size = total_len;
3604 if (total_len > size ) total_len = size;
3605 memcpy(buf, d->buf, total_len);
3606
3607 rv = total_len;
3608 err:
3609 if (f)
3610 fclose(f);
3611 free(line);
3612 free(cg);
3613 free(memusage_str);
3614 free(memswlimit_str);
3615 free(memswusage_str);
3616 free(memstat_str);
3617 return rv;
3618 }
3619
3620 /*
3621 * Read the cpuset.cpus for cg
3622 * Return the answer in a newly allocated string which must be freed
3623 */
3624 static char *get_cpuset(const char *cg)
3625 {
3626 char *answer;
3627
3628 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3629 return NULL;
3630 return answer;
3631 }
3632
3633 bool cpu_in_cpuset(int cpu, const char *cpuset);
3634
3635 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3636 {
3637 int cpu;
3638
3639 if (sscanf(line, "processor : %d", &cpu) != 1)
3640 return false;
3641 return cpu_in_cpuset(cpu, cpuset);
3642 }
3643
3644 /*
3645 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3646 * depending on `param`. The parameter's value is returned through `value`.
3647 */
3648 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3649 {
3650 bool rv = false;
3651 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3652 char *str = NULL;
3653
3654 sprintf(file, "cpu.cfs_%s_us", param);
3655
3656 if (!cgfs_get_value("cpu", cg, file, &str))
3657 goto err;
3658
3659 if (sscanf(str, "%" SCNd64, value) != 1)
3660 goto err;
3661
3662 rv = true;
3663
3664 err:
3665 if (str)
3666 free(str);
3667 return rv;
3668 }
3669
3670 /*
3671 * Return the maximum number of visible CPUs based on CPU quotas.
3672 * If there is no quota set, zero is returned.
3673 */
3674 int max_cpu_count(const char *cg)
3675 {
3676 int rv, nprocs;
3677 int64_t cfs_quota, cfs_period;
3678
3679 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3680 return 0;
3681
3682 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3683 return 0;
3684
3685 if (cfs_quota <= 0 || cfs_period <= 0)
3686 return 0;
3687
3688 rv = cfs_quota / cfs_period;
3689
3690 /* In case quota/period does not yield a whole number, add one CPU for
3691 * the remainder.
3692 */
3693 if ((cfs_quota % cfs_period) > 0)
3694 rv += 1;
3695
3696 nprocs = get_nprocs();
3697
3698 if (rv > nprocs)
3699 rv = nprocs;
3700
3701 return rv;
3702 }
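/*
 * Example: cpu.cfs_quota_us == 150000 with cpu.cfs_period_us == 100000
 * gives 150000 / 100000 == 1 with a remainder, so two CPUs are
 * reported, but never more than get_nprocs() says the host has.
 */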
3703
3704 /*
3705 * Determine whether CPU views should be used or not.
3706 */
3707 bool use_cpuview(const char *cg)
3708 {
3709 int cfd;
3710 char *tmpc;
3711
3712 tmpc = find_mounted_controller("cpu", &cfd);
3713 if (!tmpc)
3714 return false;
3715
3716 tmpc = find_mounted_controller("cpuacct", &cfd);
3717 if (!tmpc)
3718 return false;
3719
3720 return true;
3721 }
3722
3723 /*
3724 * check whether this is a "^processor" line in /proc/cpuinfo
3725 */
3726 static bool is_processor_line(const char *line)
3727 {
3728 int cpu;
3729
3730 if (sscanf(line, "processor : %d", &cpu) == 1)
3731 return true;
3732 return false;
3733 }
3734
3735 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3736 struct fuse_file_info *fi)
3737 {
3738 struct fuse_context *fc = fuse_get_context();
3739 struct file_info *d = (struct file_info *)fi->fh;
3740 char *cg;
3741 char *cpuset = NULL;
3742 char *line = NULL;
3743 size_t linelen = 0, total_len = 0, rv = 0;
3744 bool am_printing = false, firstline = true, is_s390x = false;
3745 int curcpu = -1, cpu, max_cpus = 0;
3746 bool use_view;
3747 char *cache = d->buf;
3748 size_t cache_size = d->buflen;
3749 FILE *f = NULL;
3750
3751 if (offset){
3752 if (offset > d->size)
3753 return -EINVAL;
3754 if (!d->cached)
3755 return 0;
3756 int left = d->size - offset;
3757 total_len = left > size ? size: left;
3758 memcpy(buf, cache + offset, total_len);
3759 return total_len;
3760 }
3761
3762 pid_t initpid = lookup_initpid_in_store(fc->pid);
3763 if (initpid <= 0)
3764 initpid = fc->pid;
3765 cg = get_pid_cgroup(initpid, "cpuset");
3766 if (!cg)
3767 return read_file("/proc/cpuinfo", buf, size, d);
3768 prune_init_slice(cg);
3769
3770 cpuset = get_cpuset(cg);
3771 if (!cpuset)
3772 goto err;
3773
3774 use_view = use_cpuview(cg);
3775
3776 if (use_view)
3777 max_cpus = max_cpu_count(cg);
3778
3779 f = fopen("/proc/cpuinfo", "r");
3780 if (!f)
3781 goto err;
3782
3783 while (getline(&line, &linelen, f) != -1) {
3784 ssize_t l;
3785 if (firstline) {
3786 firstline = false;
3787 if (strstr(line, "IBM/S390") != NULL) {
3788 is_s390x = true;
3789 am_printing = true;
3790 continue;
3791 }
3792 }
3793 if (strncmp(line, "# processors:", 12) == 0)
3794 continue;
3795 if (is_processor_line(line)) {
3796 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3797 break;
3798 am_printing = cpuline_in_cpuset(line, cpuset);
3799 if (am_printing) {
3800 curcpu ++;
3801 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3802 if (l < 0) {
3803 perror("Error writing to cache");
3804 rv = 0;
3805 goto err;
3806 }
3807 if (l >= cache_size) {
3808 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3809 rv = 0;
3810 goto err;
3811 }
3812 cache += l;
3813 cache_size -= l;
3814 total_len += l;
3815 }
3816 continue;
3817 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3818 char *p;
3819 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3820 break;
3821 if (!cpu_in_cpuset(cpu, cpuset))
3822 continue;
3823 curcpu ++;
3824 p = strchr(line, ':');
3825 if (!p || !*p)
3826 goto err;
3827 p++;
3828 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3829 if (l < 0) {
3830 perror("Error writing to cache");
3831 rv = 0;
3832 goto err;
3833 }
3834 if (l >= cache_size) {
3835 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3836 rv = 0;
3837 goto err;
3838 }
3839 cache += l;
3840 cache_size -= l;
3841 total_len += l;
3842 continue;
3843
3844 }
3845 if (am_printing) {
3846 l = snprintf(cache, cache_size, "%s", line);
3847 if (l < 0) {
3848 perror("Error writing to cache");
3849 rv = 0;
3850 goto err;
3851 }
3852 if (l >= cache_size) {
3853 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3854 rv = 0;
3855 goto err;
3856 }
3857 cache += l;
3858 cache_size -= l;
3859 total_len += l;
3860 }
3861 }
3862
3863 if (is_s390x) {
3864 char *origcache = d->buf;
3865 ssize_t l;
3866 do {
3867 d->buf = malloc(d->buflen);
3868 } while (!d->buf);
3869 cache = d->buf;
3870 cache_size = d->buflen;
3871 total_len = 0;
3872 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3873 if (l < 0 || l >= cache_size) {
3874 free(origcache);
3875 goto err;
3876 }
3877 cache_size -= l;
3878 cache += l;
3879 total_len += l;
3880 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3881 if (l < 0 || l >= cache_size) {
3882 free(origcache);
3883 goto err;
3884 }
3885 cache_size -= l;
3886 cache += l;
3887 total_len += l;
3888 l = snprintf(cache, cache_size, "%s", origcache);
3889 free(origcache);
3890 if (l < 0 || l >= cache_size)
3891 goto err;
3892 total_len += l;
3893 }
3894
3895 d->cached = 1;
3896 d->size = total_len;
3897 if (total_len > size ) total_len = size;
3898
3899 /* read from off 0 */
3900 memcpy(buf, d->buf, total_len);
3901 rv = total_len;
3902 err:
3903 if (f)
3904 fclose(f);
3905 free(line);
3906 free(cpuset);
3907 free(cg);
3908 return rv;
3909 }
3910
3911 static uint64_t get_reaper_start_time(pid_t pid)
3912 {
3913 int ret;
3914 FILE *f;
3915 uint64_t starttime;
3916 /* strlen("/proc/") = 6
3917 * +
3918 * LXCFS_NUMSTRLEN64
3919 * +
3920 * strlen("/stat") = 5
3921 * +
3922 * \0 = 1
3923 */
3924 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3925 char path[__PROC_PID_STAT_LEN];
3926 pid_t qpid;
3927
3928 qpid = lookup_initpid_in_store(pid);
3929 if (qpid <= 0) {
3930 /* Caller can check for EINVAL on 0. */
3931 errno = EINVAL;
3932 return 0;
3933 }
3934
3935 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3936 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3937 /* Caller can check for EINVAL on 0. */
3938 errno = EINVAL;
3939 return 0;
3940 }
3941
3942 f = fopen(path, "r");
3943 if (!f) {
3944 /* Caller can check for EINVAL on 0. */
3945 errno = EINVAL;
3946 return 0;
3947 }
3948
3949 /* Note that the *scanf() argument suppression requires that length
3950 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3951 * at us. It's like telling someone you're not married and then asking
3952 * if you can bring your wife to the party.
3953 */
3954 ret = fscanf(f, "%*d " /* (1) pid %d */
3955 "%*s " /* (2) comm %s */
3956 "%*c " /* (3) state %c */
3957 "%*d " /* (4) ppid %d */
3958 "%*d " /* (5) pgrp %d */
3959 "%*d " /* (6) session %d */
3960 "%*d " /* (7) tty_nr %d */
3961 "%*d " /* (8) tpgid %d */
3962 "%*u " /* (9) flags %u */
3963 "%*u " /* (10) minflt %lu */
3964 "%*u " /* (11) cminflt %lu */
3965 "%*u " /* (12) majflt %lu */
3966 "%*u " /* (13) cmajflt %lu */
3967 "%*u " /* (14) utime %lu */
3968 "%*u " /* (15) stime %lu */
3969 "%*d " /* (16) cutime %ld */
3970 "%*d " /* (17) cstime %ld */
3971 "%*d " /* (18) priority %ld */
3972 "%*d " /* (19) nice %ld */
3973 "%*d " /* (20) num_threads %ld */
3974 "%*d " /* (21) itrealvalue %ld */
3975 "%" PRIu64, /* (22) starttime %llu */
3976 &starttime);
3977 if (ret != 1) {
3978 fclose(f);
3979 /* Caller can check for EINVAL on 0. */
3980 errno = EINVAL;
3981 return 0;
3982 }
3983
3984 fclose(f);
3985
3986 errno = 0;
3987 return starttime;
3988 }
3989
3990 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3991 {
3992 uint64_t clockticks;
3993 int64_t ticks_per_sec;
3994
3995 clockticks = get_reaper_start_time(pid);
3996 if (clockticks == 0 && errno == EINVAL) {
3997 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3998 return 0;
3999 }
4000
4001 ticks_per_sec = sysconf(_SC_CLK_TCK);
4002 if (ticks_per_sec < 0 && errno == EINVAL) {
4003 lxcfs_debug(
4004 "%s\n",
4005 "failed to determine number of clock ticks in a second");
4006 return 0;
4007 }
4008
4009 return clockticks / ticks_per_sec;
4010 }
4011
4012 static uint64_t get_reaper_age(pid_t pid)
4013 {
4014 uint64_t procstart, uptime, procage;
4015
4016 /* To get the actual reaper age, subtract the time at which the reaper
4017 * started (measured relative to system boot) from the current time
4018 * since boot.
4019 */
4020 procstart = get_reaper_start_time_in_sec(pid);
4021 procage = procstart;
4022 if (procstart > 0) {
4023 int ret;
4024 struct timespec spec;
4025
4026 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4027 if (ret < 0)
4028 return 0;
4029 /* We could make this more precise here by using the tv_nsec
4030 * field in the timespec struct and convert it to milliseconds
4031 * and then create a double for the seconds and milliseconds but
4032 * that seems more work than it is worth.
4033 */
4034 uptime = spec.tv_sec;
4035 procage = uptime - procstart;
4036 }
4037
4038 return procage;
4039 }
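/*
 * Example: with CLOCK_BOOTTIME at 5000s and a reaper that started 4990s
 * after boot, the age returned is 10s, i.e. roughly the uptime such a
 * container ought to report.
 */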
4040
4041 /*
4042 * Returns 0 on success.
4043 * It is the caller's responsibility to free `return_usage`, unless this
4044 * function returns an error.
4045 */
4046 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
4047 {
4048 int cpucount = get_nprocs();
4049 struct cpuacct_usage *cpu_usage;
4050 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4051 int cg_cpu;
4052 uint64_t cg_user, cg_system;
4053 int64_t ticks_per_sec;
4054 char *usage_str = NULL;
4055
4056 ticks_per_sec = sysconf(_SC_CLK_TCK);
4057
4058 if (ticks_per_sec < 0 && errno == EINVAL) {
4059 lxcfs_debug(
4060 "%s\n",
4061 "read_cpuacct_usage_all failed to determine number of clock ticks "
4062 "in a second");
4063 return -1;
4064 }
4065
4066 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4067 if (!cpu_usage)
4068 return -ENOMEM;
4069
4070 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4071 rv = -1;
4072 goto err;
4073 }
4074
4075 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4076 lxcfs_error("read_cpuacct_usage_all reading first line from "
4077 "%s/cpuacct.usage_all failed.\n", cg);
4078 rv = -1;
4079 goto err;
4080 }
4081
4082 read_pos += read_cnt;
4083
4084 for (i = 0, j = 0; i < cpucount; i++) {
4085 ret = sscanf(usage_str + read_pos, "%d %" SCNu64 " %" SCNu64 "\n%n", &cg_cpu, &cg_user,
4086 &cg_system, &read_cnt);
4087
4088 if (ret == EOF)
4089 break;
4090
4091 if (ret != 3) {
4092 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4093 "failed.\n", cg);
4094 rv = -1;
4095 goto err;
4096 }
4097
4098 read_pos += read_cnt;
4099
4100 if (!cpu_in_cpuset(i, cpuset))
4101 continue;
4102
4103 /* Convert the time from nanoseconds to USER_HZ */
4104 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4105 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4106 j++;
4107 }
4108
4109 rv = 0;
4110 *return_usage = cpu_usage;
4111
4112 err:
4113 if (usage_str)
4114 free(usage_str);
4115
4116 if (rv != 0) {
4117 free(cpu_usage);
4118 *return_usage = NULL;
4119 }
4120
4121 return rv;
4122 }
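/*
 * The cpuacct.usage_all format parsed above, for reference (values in
 * nanoseconds, converted to USER_HZ ticks per cpu):
 *
 * cpu user system
 * 0 6000000000 2000000000
 * 1 1000000000 0
 *
 * With sysconf(_SC_CLK_TCK) == 100, cpu 0 becomes user == 600 and
 * system == 200 ticks.
 */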
4123
4124 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4125 {
4126 int i;
4127 unsigned long sum = 0;
4128
4129 for (i = 0; i < cpu_count; i++) {
4130 /* When cpuset is changed on the fly, the CPUs might get reordered.
4131 * We could either reset all counters, or check that the subtractions
4132 * below will return expected results.
4133 */
4134 if (newer[i].user > older[i].user)
4135 diff[i].user = newer[i].user - older[i].user;
4136 else
4137 diff[i].user = 0;
4138
4139 if (newer[i].system > older[i].system)
4140 diff[i].system = newer[i].system - older[i].system;
4141 else
4142 diff[i].system = 0;
4143
4144 if (newer[i].idle > older[i].idle)
4145 diff[i].idle = newer[i].idle - older[i].idle;
4146 else
4147 diff[i].idle = 0;
4148
4149 sum += diff[i].user;
4150 sum += diff[i].system;
4151 sum += diff[i].idle;
4152 }
4153
4154 return sum;
4155 }
4156
4157 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4158 {
4159 unsigned long free_space, to_add;
4160
4161 free_space = threshold - usage->user - usage->system;
4162
4163 if (free_space > usage->idle)
4164 free_space = usage->idle;
4165
4166 to_add = free_space > *surplus ? *surplus : free_space;
4167
4168 *counter += to_add;
4169 usage->idle -= to_add;
4170 *surplus -= to_add;
4171 }
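/*
 * Worked example for add_cpu_usage(): with threshold == 100 and
 * usage->user + usage->system == 70, free_space starts at 30; given
 * usage->idle == 40 and *surplus == 50, to_add == 30, so the counter
 * gains 30 ticks, idle drops to 10, and 20 surplus ticks remain.
 */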
4172
4173 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4174 {
4175 struct cg_proc_stat *first = NULL, *prev, *tmp;
4176
4177 for (prev = NULL; node; ) {
4178 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4179 tmp = node;
4180 lxcfs_debug("Removing stat node for %s\n", node->cg);
4181
4182 if (prev)
4183 prev->next = node->next;
4184 else
4185 first = node->next;
4186
4187 node = node->next;
4188 free_proc_stat_node(tmp);
4189 } else {
4190 if (!first)
4191 first = node;
4192 prev = node;
4193 node = node->next;
4194 }
4195 }
4196
4197 return first;
4198 }
4199
4200 #define PROC_STAT_PRUNE_INTERVAL 10
4201 static void prune_proc_stat_history(void)
4202 {
4203 int i;
4204 time_t now = time(NULL);
4205
4206 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4207 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4208
4209 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4210 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4211 return;
4212 }
4213
4214 if (proc_stat_history[i]->next) {
4215 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4216 proc_stat_history[i]->lastcheck = now;
4217 }
4218
4219 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4220 }
4221 }
4222
4223 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4224 {
4225 struct cg_proc_stat *node;
4226
4227 pthread_rwlock_rdlock(&head->lock);
4228
4229 if (!head->next) {
4230 pthread_rwlock_unlock(&head->lock);
4231 return NULL;
4232 }
4233
4234 node = head->next;
4235
4236 do {
4237 if (strcmp(cg, node->cg) == 0)
4238 goto out;
4239 } while ((node = node->next));
4240
4241 node = NULL;
4242
4243 out:
4244 pthread_rwlock_unlock(&head->lock);
4245 prune_proc_stat_history();
4246 return node;
4247 }
4248
4249 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4250 {
4251 struct cg_proc_stat *node;
4252 int i;
4253
4254 node = malloc(sizeof(struct cg_proc_stat));
4255 if (!node)
4256 goto err;
4257
4258 node->cg = NULL;
4259 node->usage = NULL;
4260 node->view = NULL;
4261
4262 node->cg = malloc(strlen(cg) + 1);
4263 if (!node->cg)
4264 goto err;
4265
4266 strcpy(node->cg, cg);
4267
4268 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4269 if (!node->usage)
4270 goto err;
4271
4272 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4273
4274 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4275 if (!node->view)
4276 goto err;
4277
4278 node->cpu_count = cpu_count;
4279 node->next = NULL;
4280
4281 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4282 lxcfs_error("%s\n", "Failed to initialize node lock");
4283 goto err;
4284 }
4285
4286 for (i = 0; i < cpu_count; i++) {
4287 node->view[i].user = 0;
4288 node->view[i].system = 0;
4289 node->view[i].idle = 0;
4290 }
4291
4292 return node;
4293
4294 err:
4295 if (node && node->cg)
4296 free(node->cg);
4297 if (node && node->usage)
4298 free(node->usage);
4299 if (node && node->view)
4300 free(node->view);
4301 if (node)
4302 free(node);
4303
4304 return NULL;
4305 }
4306
4307 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4308 {
4309 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4310 struct cg_proc_stat_head *head = proc_stat_history[hash];
4311 struct cg_proc_stat *node, *rv = new_node;
4312
4313 pthread_rwlock_wrlock(&head->lock);
4314
4315 if (!head->next) {
4316 head->next = new_node;
4317 goto out;
4318 }
4319
4320 node = head->next;
4321
4322 for (;;) {
4323 if (strcmp(node->cg, new_node->cg) == 0) {
4324 /* The node is already present, return it */
4325 free_proc_stat_node(new_node);
4326 rv = node;
4327 goto out;
4328 }
4329
4330 if (node->next) {
4331 node = node->next;
4332 continue;
4333 }
4334
4335 node->next = new_node;
4336 goto out;
4337 }
4338
4339 out:
4340 pthread_rwlock_unlock(&head->lock);
4341 return rv;
4342 }
4343
4344 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4345 {
4346 struct cpuacct_usage *new_usage, *new_view;
4347 int i;
4348
4349 /* Allocate new memory */
4350 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4351 if (!new_usage)
4352 return false;
4353
4354 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4355 if (!new_view) {
4356 free(new_usage);
4357 return false;
4358 }
4359
4360 /* Copy existing data & initialize new elements */
4361 for (i = 0; i < cpu_count; i++) {
4362 if (i < node->cpu_count) {
4363 new_usage[i].user = node->usage[i].user;
4364 new_usage[i].system = node->usage[i].system;
4365 new_usage[i].idle = node->usage[i].idle;
4366
4367 new_view[i].user = node->view[i].user;
4368 new_view[i].system = node->view[i].system;
4369 new_view[i].idle = node->view[i].idle;
4370 } else {
4371 new_usage[i].user = 0;
4372 new_usage[i].system = 0;
4373 new_usage[i].idle = 0;
4374
4375 new_view[i].user = 0;
4376 new_view[i].system = 0;
4377 new_view[i].idle = 0;
4378 }
4379 }
4380
4381 free(node->usage);
4382 free(node->view);
4383
4384 node->usage = new_usage;
4385 node->view = new_view;
4386 node->cpu_count = cpu_count;
4387
4388 return true;
4389 }
4390
4391 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4392 {
4393 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4394 struct cg_proc_stat_head *head = proc_stat_history[hash];
4395 struct cg_proc_stat *node;
4396
4397 node = find_proc_stat_node(head, cg);
4398
4399 if (!node) {
4400 node = new_proc_stat_node(usage, cpu_count, cg);
4401 if (!node)
4402 return NULL;
4403
4404 node = add_proc_stat_node(node);
4405 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4406 }
4407
4408 pthread_mutex_lock(&node->lock);
4409
4410 /* If additional CPUs on the host have been enabled, CPU usage counter
4411 * arrays have to be expanded */
4412 if (node->cpu_count < cpu_count) {
4413 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4414 node->cpu_count, cpu_count, cg);
4415
4416 if (!expand_proc_stat_node(node, cpu_count)) {
4417 pthread_mutex_unlock(&node->lock);
4418 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4419 node->cpu_count, cpu_count, cg);
4420 return NULL;
4421 }
4422 }
4423
4424 return node;
4425 }
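/*
 * NOTE: on success this function returns with node->lock held; the caller
 * (cpuview_proc_stat() below) must drop it, see its err: label. When
 * expand_proc_stat_node() fails, the lock has already been released here.
 */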
4426
4427 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4428 {
4429 int i;
4430
4431 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4432 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4433
4434 for (i = 0; i < cpu_count; i++) {
4435 node->view[i].user = 0;
4436 node->view[i].system = 0;
4437 node->view[i].idle = 0;
4438 }
4439
4440 node->cpu_count = cpu_count;
4441 }
4442
4443 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size)
4444 {
4445 char *line = NULL;
4446 size_t linelen = 0, total_len = 0, rv = 0; ssize_t l; /* l must be signed: the l < 0 checks below can never fire for a size_t */
4447 int curcpu = -1; /* cpu numbering starts at 0 */
4448 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4449 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4450 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4451 unsigned long user_surplus = 0, system_surplus = 0;
4452 unsigned long total_sum, threshold;
4453 struct cg_proc_stat *stat_node = NULL; /* NULL so the err: path can test it safely */
4454 struct cpuacct_usage *diff = NULL;
4455 int nprocs = get_nprocs();
4456
4457 /* Read all CPU stats and stop when we've encountered other lines */
4458 while (getline(&line, &linelen, f) != -1) {
4459 int cpu, ret;
4460 char cpu_char[10]; /* That's a lot of cores */
4461 uint64_t all_used, cg_used;
4462
4463 if (strlen(line) == 0)
4464 continue;
4465 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4466 /* not a ^cpuN line containing a number N */
4467 break;
4468 }
4469
4470 if (sscanf(cpu_char, "%d", &cpu) != 1)
4471 continue;
4472 if (!cpu_in_cpuset(cpu, cpuset))
4473 continue;
4474 curcpu++;
4475 cpu_cnt++;
4476
4477 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4478 &user,
4479 &nice,
4480 &system,
4481 &idle,
4482 &iowait,
4483 &irq,
4484 &softirq,
4485 &steal,
4486 &guest,
4487 &guest_nice);
4488
4489 if (ret != 10)
4490 continue;
4491
4492 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4493 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4494
4495 if (all_used >= cg_used) {
4496 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4497
4498 } else {
4499 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4500 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4501 curcpu, cg, all_used, cg_used);
4502 cg_cpu_usage[curcpu].idle = idle;
4503 }
4504 }
4505
4506 /* Cannot use more CPUs than are available due to cpuset */
4507 if (max_cpus > cpu_cnt)
4508 max_cpus = cpu_cnt;
4509
4510 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4511
4512 if (!stat_node) {
4513 lxcfs_error("unable to find/create stat node for %s\n", cg);
4514 rv = 0;
4515 goto err;
4516 }
4517
4518 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4519 if (!diff) {
4520 rv = 0;
4521 goto err;
4522 }
4523
4524 /*
4525 * If the new values are LOWER than values stored in memory, it means
4526 * the cgroup has been reset/recreated and we should reset too.
4527 */
4528 if (cg_cpu_usage[0].user < stat_node->usage[0].user)
4529 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4530
4531 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, cpu_cnt);
4532
4533 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4534 stat_node->usage[curcpu].user += diff[curcpu].user;
4535 stat_node->usage[curcpu].system += diff[curcpu].system;
4536 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4537
4538 if (max_cpus > 0 && curcpu >= max_cpus) {
4539 user_surplus += diff[curcpu].user;
4540 system_surplus += diff[curcpu].system;
4541 }
4542 }
4543
4544 /* Calculate usage counters of visible CPUs */
4545 if (max_cpus > 0) {
4546 /* threshold = maximum usage per cpu, including idle */
4547 threshold = total_sum / cpu_cnt * max_cpus;
4548
4549 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4550 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4551 continue;
4552
4553 /* Add user */
4554 add_cpu_usage(
4555 &user_surplus,
4556 &diff[curcpu],
4557 &diff[curcpu].user,
4558 threshold);
4559
4560 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4561 continue;
4562
4563 /* If there is still room, add system */
4564 add_cpu_usage(
4565 &system_surplus,
4566 &diff[curcpu],
4567 &diff[curcpu].system,
4568 threshold);
4569 }
4570
4571 if (user_surplus > 0)
4572 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4573 if (system_surplus > 0)
4574 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4575
4576 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4577 stat_node->view[curcpu].user += diff[curcpu].user;
4578 stat_node->view[curcpu].system += diff[curcpu].system;
4579 stat_node->view[curcpu].idle += diff[curcpu].idle;
4580
4581 user_sum += stat_node->view[curcpu].user;
4582 system_sum += stat_node->view[curcpu].system;
4583 idle_sum += stat_node->view[curcpu].idle;
4584 }
4585
4586 } else {
4587 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4588 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4589 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4590 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4591
4592 user_sum += stat_node->view[curcpu].user;
4593 system_sum += stat_node->view[curcpu].system;
4594 idle_sum += stat_node->view[curcpu].idle;
4595 }
4596 }
4597
4598 /* Render the file */
4599 /* cpu-all */
4600 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4601 user_sum,
4602 system_sum,
4603 idle_sum);
4604
4605 if (l < 0) {
4606 perror("Error writing to cache");
4607 rv = 0;
4608 goto err;
4610 }
4611 if (l >= buf_size) {
4612 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4613 rv = 0;
4614 goto err;
4615 }
4616
4617 buf += l;
4618 buf_size -= l;
4619 total_len += l;
4620
4621 /* Render visible CPUs */
4622 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4623 if (max_cpus > 0 && curcpu == max_cpus)
4624 break;
4625
4626 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4627 curcpu,
4628 stat_node->view[curcpu].user,
4629 stat_node->view[curcpu].system,
4630 stat_node->view[curcpu].idle);
4631
4632 if (l < 0) {
4633 perror("Error writing to cache");
4634 rv = 0;
4635 goto err;
4637 }
4638 if (l >= buf_size) {
4639 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4640 rv = 0;
4641 goto err;
4642 }
4643
4644 buf += l;
4645 buf_size -= l;
4646 total_len += l;
4647 }
4648
4649 /* Pass the rest of /proc/stat, start with the last line read */
4650 l = snprintf(buf, buf_size, "%s", line);
4651
4652 if (l < 0) {
4653 perror("Error writing to cache");
4654 rv = 0;
4655 goto err;
4657 }
4658 if (l >= buf_size) {
4659 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4660 rv = 0;
4661 goto err;
4662 }
4663
4664 buf += l;
4665 buf_size -= l;
4666 total_len += l;
4667
4668 /* Pass the rest of the host's /proc/stat */
4669 while (getline(&line, &linelen, f) != -1) {
4670 l = snprintf(buf, buf_size, "%s", line);
4671 if (l < 0) {
4672 perror("Error writing to cache");
4673 rv = 0;
4674 goto err;
4675 }
4676 if (l >= buf_size) {
4677 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678 rv = 0;
4679 goto err;
4680 }
4681 buf += l;
4682 buf_size -= l;
4683 total_len += l;
4684 }
4685
4686 rv = total_len;
4687
4688 err:
4689 if (stat_node)
4690 pthread_mutex_unlock(&stat_node->lock);
4691 if (line)
4692 free(line);
4693 if (diff)
4694 free(diff);
4695 return rv;
4696 }
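/*
 * Worked example (invented numbers, for illustration only): with
 * cpu_cnt = 4 host CPUs in the cpuset but max_cpus = 2 allowed by the
 * quota, and total_sum = 1000 ticks of delta over all CPUs, the cap is
 *
 *   threshold = total_sum / cpu_cnt * max_cpus = 1000 / 4 * 2 = 500
 *
 * Deltas accrued on cpus 2 and 3 become user/system surplus, and
 * add_cpu_usage() folds that surplus into cpus 0 and 1 until each of
 * them reaches the threshold. The container thus sees two busy CPUs
 * rather than a little time smeared across four.
 */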
4697
4698 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4699 static int proc_stat_read(char *buf, size_t size, off_t offset,
4700 struct fuse_file_info *fi)
4701 {
4702 struct fuse_context *fc = fuse_get_context();
4703 struct file_info *d = (struct file_info *)fi->fh;
4704 char *cg;
4705 char *cpuset = NULL;
4706 char *line = NULL;
4707 size_t linelen = 0, total_len = 0, rv = 0;
4708 int curcpu = -1; /* cpu numbering starts at 0 */
4709 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4710 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4711 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4712 char cpuall[CPUALL_MAX_SIZE];
4713 /* reserve for cpu all */
4714 char *cache = d->buf + CPUALL_MAX_SIZE;
4715 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4716 FILE *f = NULL;
4717 struct cpuacct_usage *cg_cpu_usage = NULL;
4718
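/*
 * Buffer layout sketch (for orientation): the first CPUALL_MAX_SIZE bytes
 * of d->buf are reserved for the aggregate "cpu ..." line, which can only
 * be produced once the per-CPU lines have been summed:
 *
 *   d->buf: [ reserved for "cpu ..." | per-cpu and passthrough lines ]
 *
 * After the read loop the aggregate is written at the front and the
 * cached region is memmove()d up directly behind it.
 */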
4719 if (offset) {
4720 if (offset > d->size)
4721 return -EINVAL;
4722 if (!d->cached)
4723 return 0;
4724 int left = d->size - offset;
4725 total_len = left > size ? size: left;
4726 memcpy(buf, d->buf + offset, total_len);
4727 return total_len;
4728 }
4729
4730 pid_t initpid = lookup_initpid_in_store(fc->pid);
4731 if (initpid <= 0)
4732 initpid = fc->pid;
4733 cg = get_pid_cgroup(initpid, "cpuset");
4734 if (!cg)
4735 return read_file("/proc/stat", buf, size, d);
4736 prune_init_slice(cg);
4737
4738 cpuset = get_cpuset(cg);
4739 if (!cpuset)
4740 goto err;
4741
4742 /*
4743 * Read cpuacct.usage_all for all CPUs.
4744 * If the cpuacct cgroup is present, it is used to calculate the container's
4745 * CPU usage. If not, values from the host's /proc/stat are used.
4746 */
4747 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
4748 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4749 "falling back to the host's /proc/stat");
4750 }
4751
4752 f = fopen("/proc/stat", "r");
4753 if (!f)
4754 goto err;
4755
4756 // Skip the first line (the "cpu" aggregate); it is recomputed per container below.
4757 if (getline(&line, &linelen, f) < 0) {
4758 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4759 goto err;
4760 }
4761
4762 if (use_cpuview(cg) && cg_cpu_usage) {
4763 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen);
4764 goto out;
4765 }
4766
4767 while (getline(&line, &linelen, f) != -1) {
4768 ssize_t l;
4769 int cpu;
4770 char cpu_char[10]; /* That's a lot of cores */
4771 char *c;
4772 uint64_t all_used, cg_used, new_idle;
4773 int ret;
4774
4775 if (strlen(line) == 0)
4776 continue;
4777 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4778 /* not a ^cpuN line containing a number N, just print it */
4779 l = snprintf(cache, cache_size, "%s", line);
4780 if (l < 0) {
4781 perror("Error writing to cache");
4782 rv = 0;
4783 goto err;
4784 }
4785 if (l >= cache_size) {
4786 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4787 rv = 0;
4788 goto err;
4789 }
4790 cache += l;
4791 cache_size -= l;
4792 total_len += l;
4793 continue;
4794 }
4795
4796 if (sscanf(cpu_char, "%d", &cpu) != 1)
4797 continue;
4798 if (!cpu_in_cpuset(cpu, cpuset))
4799 continue;
4800 curcpu ++;
4801
4802 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4803 &user,
4804 &nice,
4805 &system,
4806 &idle,
4807 &iowait,
4808 &irq,
4809 &softirq,
4810 &steal,
4811 &guest,
4812 &guest_nice);
4813
4814 if (ret != 10 || !cg_cpu_usage) {
4815 c = strchr(line, ' ');
4816 if (!c)
4817 continue;
4818 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4819 if (l < 0) {
4820 perror("Error writing to cache");
4821 rv = 0;
4822 goto err;
4824 }
4825 if (l >= cache_size) {
4826 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4827 rv = 0;
4828 goto err;
4829 }
4830
4831 cache += l;
4832 cache_size -= l;
4833 total_len += l;
4834
4835 if (ret != 10)
4836 continue;
4837 }
4838
4839 if (cg_cpu_usage) {
4840 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4841 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4842
4843 if (all_used >= cg_used) {
4844 new_idle = idle + (all_used - cg_used);
4845
4846 } else {
4847 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4848 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4849 curcpu, cg, all_used, cg_used);
4850 new_idle = idle;
4851 }
4852
4853 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4854 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4855 new_idle);
4856
4857 if (l < 0) {
4858 perror("Error writing to cache");
4859 rv = 0;
4860 goto err;
4862 }
4863 if (l >= cache_size) {
4864 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4865 rv = 0;
4866 goto err;
4867 }
4868
4869 cache += l;
4870 cache_size -= l;
4871 total_len += l;
4872
4873 user_sum += cg_cpu_usage[curcpu].user;
4874 system_sum += cg_cpu_usage[curcpu].system;
4875 idle_sum += new_idle;
4876
4877 } else {
4878 user_sum += user;
4879 nice_sum += nice;
4880 system_sum += system;
4881 idle_sum += idle;
4882 iowait_sum += iowait;
4883 irq_sum += irq;
4884 softirq_sum += softirq;
4885 steal_sum += steal;
4886 guest_sum += guest;
4887 guest_nice_sum += guest_nice;
4888 }
4889 }
4890
4891 cache = d->buf;
4892
4893 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4894 user_sum,
4895 nice_sum,
4896 system_sum,
4897 idle_sum,
4898 iowait_sum,
4899 irq_sum,
4900 softirq_sum,
4901 steal_sum,
4902 guest_sum,
4903 guest_nice_sum);
4904 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4905 memcpy(cache, cpuall, cpuall_len);
4906 cache += cpuall_len;
4907 } else {
4908 /* shouldn't happen */
4909 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.\n", cpuall_len);
4910 cpuall_len = 0;
4911 }
4912
4913 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4914 total_len += cpuall_len;
4915
4916 out:
4917 d->cached = 1;
4918 d->size = total_len;
4919 if (total_len > size)
4920 total_len = size;
4921
4922 memcpy(buf, d->buf, total_len);
4923 rv = total_len;
4924
4925 err:
4926 if (f)
4927 fclose(f);
4928 if (cg_cpu_usage)
4929 free(cg_cpu_usage);
4930 free(line);
4931 free(cpuset);
4932 free(cg);
4933 return rv;
4934 }
4935
4936 /* This function retrieves the busy time of a group of tasks by looking at
4937 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4938 * been given its own cpuacct cgroup. If not, this function will also count
4939 * the busy time of all other tasks that do not actually belong to the
4940 * container. If someone has a clever solution for this please send a
4941 * patch!
4942 */
4943 static unsigned long get_reaper_busy(pid_t task)
4944 {
4945 pid_t initpid = lookup_initpid_in_store(task);
4946 char *cgroup = NULL, *usage_str = NULL;
4947 unsigned long usage = 0;
4948
4949 if (initpid <= 0)
4950 return 0;
4951
4952 cgroup = get_pid_cgroup(initpid, "cpuacct");
4953 if (!cgroup)
4954 goto out;
4955 prune_init_slice(cgroup);
4956 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4957 goto out;
4958 usage = strtoul(usage_str, NULL, 10);
4959 usage /= 1000000000; /* cpuacct.usage is cumulative nanoseconds; convert to seconds */
4960
4961 out:
4962 free(cgroup);
4963 free(usage_str);
4964 return usage;
4965 }
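/*
 * Example (illustrative): cpuacct.usage holds cumulative nanoseconds, so
 * a container that burned one CPU-hour reads 3600000000000, which the
 * division above turns into 3600 seconds of busy time.
 */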
4966
4967 #if RELOADTEST
4968 void iwashere(void)
4969 {
4970 int fd;
4971
4972 fd = creat("/tmp/lxcfs-iwashere", 0644);
4973 if (fd >= 0)
4974 close(fd);
4975 }
4976 #endif
4977
4978 /*
4979 * The first field is the age of the calling pid's reaper as returned by
4980 * get_reaper_age(). The second (idle) field is that age minus the busy
4981 * time from get_reaper_busy(), clamped so it can never go negative.
4982 */
4983 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4984 struct fuse_file_info *fi)
4985 {
4986 struct fuse_context *fc = fuse_get_context();
4987 struct file_info *d = (struct file_info *)fi->fh;
4988 unsigned long int busytime = get_reaper_busy(fc->pid);
4989 char *cache = d->buf;
4990 ssize_t total_len = 0;
4991 uint64_t idletime, reaperage;
4992
4993 #if RELOADTEST
4994 iwashere();
4995 #endif
4996
4997 if (offset) {
4998 if (!d->cached)
4999 return 0;
5000 if (offset > d->size)
5001 return -EINVAL;
5002 int left = d->size - offset;
5003 total_len = left > size ? size: left;
5004 memcpy(buf, cache + offset, total_len);
5005 return total_len;
5006 }
5007
5008 reaperage = get_reaper_age(fc->pid);
5009 /* To understand why busytime is subtracted here, please read the
5010 * comment above the get_reaper_busy() function.
5011 */
5012 idletime = reaperage;
5013 if (reaperage >= busytime)
5014 idletime = reaperage - busytime;
5015
5016 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5017 if (total_len < 0 || total_len >= d->buflen){
5018 lxcfs_error("%s\n", "failed to write to cache");
5019 return 0;
5020 }
5021
5022 d->size = (int)total_len;
5023 d->cached = 1;
5024
5025 if (total_len > size) total_len = size;
5026
5027 memcpy(buf, d->buf, total_len);
5028 return total_len;
5029 }
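/*
 * Example of the rendered file (hypothetical values): a reaper that is
 * 4000 seconds old with 120 seconds of cpuacct busy time yields
 *
 *   4000.00 3880.00
 *
 * i.e. both fields are whole seconds with a fixed ".00" fraction.
 */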
5030
5031 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5032 struct fuse_file_info *fi)
5033 {
5034 char dev_name[72];
5035 struct fuse_context *fc = fuse_get_context();
5036 struct file_info *d = (struct file_info *)fi->fh;
5037 char *cg;
5038 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5039 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5040 unsigned long read = 0, write = 0;
5041 unsigned long read_merged = 0, write_merged = 0;
5042 unsigned long read_sectors = 0, write_sectors = 0;
5043 unsigned long read_ticks = 0, write_ticks = 0;
5044 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5045 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5046 char *cache = d->buf;
5047 size_t cache_size = d->buflen;
5048 char *line = NULL;
5049 size_t linelen = 0, total_len = 0, rv = 0;
5050 unsigned int major = 0, minor = 0;
5051 int i = 0;
5052 FILE *f = NULL;
5053
5054 if (offset) {
5055 if (offset > d->size)
5056 return -EINVAL;
5057 if (!d->cached)
5058 return 0;
5059 int left = d->size - offset;
5060 total_len = left > size ? size: left;
5061 memcpy(buf, cache + offset, total_len);
5062 return total_len;
5063 }
5064
5065 pid_t initpid = lookup_initpid_in_store(fc->pid);
5066 if (initpid <= 0)
5067 initpid = fc->pid;
5068 cg = get_pid_cgroup(initpid, "blkio");
5069 if (!cg)
5070 return read_file("/proc/diskstats", buf, size, d);
5071 prune_init_slice(cg);
5072
5073 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5074 goto err;
5075 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5076 goto err;
5077 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5078 goto err;
5079 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5080 goto err;
5081 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5082 goto err;
5083
5084
5085 f = fopen("/proc/diskstats", "r");
5086 if (!f)
5087 goto err;
5088
5089 while (getline(&line, &linelen, f) != -1) {
5090 ssize_t l;
5091 char lbuf[256];
5092
5093 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5094 if (i != 3)
5095 continue;
5096
5097 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5098 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5099 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5100 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5101 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5102 read_sectors = read_sectors/512;
5103 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5104 write_sectors = write_sectors/512;
5105
5106 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5107 rd_svctm = rd_svctm/1000000;
5108 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5109 rd_wait = rd_wait/1000000;
5110 read_ticks = rd_svctm + rd_wait;
5111
5112 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5113 wr_svctm = wr_svctm/1000000;
5114 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5115 wr_wait = wr_wait/1000000;
5116 write_ticks = wr_svctm + wr_wait;
5117
5118 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5119 tot_ticks = tot_ticks/1000000;
5120
5121 memset(lbuf, 0, 256);
5122 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5123 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5124 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5125 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5126 else
5127 continue;
5128
5129 l = snprintf(cache, cache_size, "%s", lbuf);
5130 if (l < 0) {
5131 perror("Error writing to fuse buf");
5132 rv = 0;
5133 goto err;
5134 }
5135 if (l >= cache_size) {
5136 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5137 rv = 0;
5138 goto err;
5139 }
5140 cache += l;
5141 cache_size -= l;
5142 total_len += l;
5143 }
5144
5145 d->cached = 1;
5146 d->size = total_len;
5147 if (total_len > size) total_len = size;
5148 memcpy(buf, d->buf, total_len);
5149
5150 rv = total_len;
5151 err:
5152 free(cg);
5153 if (f)
5154 fclose(f);
5155 free(line);
5156 free(io_serviced_str);
5157 free(io_merged_str);
5158 free(io_service_bytes_str);
5159 free(io_wait_time_str);
5160 free(io_service_time_str);
5161 return rv;
5162 }
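/*
 * Mapping sketch: the emitted line mirrors the kernel's /proc/diskstats
 * field order, with the blkio counters slotted in as follows:
 *
 *   major minor name reads read_merged read_sectors read_ticks
 *   writes write_merged write_sectors write_ticks ios_pgr tot_ticks rq_ticks
 *
 * ios_pgr and rq_ticks have no blkio source here and stay 0; sectors are
 * derived from byte counts (/512) and ticks from nanoseconds (/1000000).
 */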
5163
5164 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5165 struct fuse_file_info *fi)
5166 {
5167 struct fuse_context *fc = fuse_get_context();
5168 struct file_info *d = (struct file_info *)fi->fh;
5169 char *cg = NULL;
5170 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5171 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5172 ssize_t total_len = 0, rv = 0;
5173 ssize_t l = 0;
5174 char *cache = d->buf;
5175
5176 if (offset) {
5177 if (offset > d->size)
5178 return -EINVAL;
5179 if (!d->cached)
5180 return 0;
5181 int left = d->size - offset;
5182 total_len = left > size ? size: left;
5183 memcpy(buf, cache + offset, total_len);
5184 return total_len;
5185 }
5186
5187 pid_t initpid = lookup_initpid_in_store(fc->pid);
5188 if (initpid <= 0)
5189 initpid = fc->pid;
5190 cg = get_pid_cgroup(initpid, "memory");
5191 if (!cg)
5192 return read_file("/proc/swaps", buf, size, d);
5193 prune_init_slice(cg);
5194
5195 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5196
5197 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5198 goto err;
5199
5200 memusage = strtoul(memusage_str, NULL, 10);
5201
5202 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5203 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5204
5205 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5206 memswusage = strtoul(memswusage_str, NULL, 10);
5207
5208 swap_total = (memswlimit - memlimit) / 1024;
5209 swap_free = (memswusage - memusage) / 1024;
5210 }
5211
5212 total_len = snprintf(d->buf, d->buflen, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); /* d->buflen, not d->size: once cached, d->size is only the content length */
5213
5214 /* When no mem + swap limit is specified or swapaccount=0 */
5215 if (!memswlimit) {
5216 char *line = NULL;
5217 size_t linelen = 0;
5218 FILE *f = fopen("/proc/meminfo", "r");
5219
5220 if (!f)
5221 goto err;
5222
5223 while (getline(&line, &linelen, f) != -1) {
5224 if (startswith(line, "SwapTotal:")) {
5225 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5226 } else if (startswith(line, "SwapFree:")) {
5227 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5228 }
5229 }
5230
5231 free(line);
5232 fclose(f);
5233 }
5234
5235 if (swap_total > 0) {
5236 l = snprintf(d->buf + total_len, d->buflen - total_len,
5237 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5238 swap_total, swap_free);
5239 total_len += l;
5240 }
5241
5242 if (total_len < 0 || l < 0) {
5243 perror("Error writing to cache");
5244 rv = 0;
5245 goto err;
5246 }
5247
5248 d->cached = 1;
5249 d->size = (int)total_len;
5250
5251 if (total_len > size) total_len = size;
5252 memcpy(buf, d->buf, total_len);
5253 rv = total_len;
5254
5255 err:
5256 free(cg);
5257 free(memswlimit_str);
5258 free(memlimit_str);
5259 free(memusage_str);
5260 free(memswusage_str);
5261 return rv;
5262 }
5263 /*
5264 * Recursively collect the PIDs that live under a cgroup path,
5265 * e.g. from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
5266 * @pid_buf : array the PID strings are appended to.
5267 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5268 * @depth : how many directory levels below @dpath are still descended.
5269 * @sum : the number of PIDs collected so far; the new total is returned.
5270 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5271 */
5272 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5273 {
5274 DIR *dir;
5275 int fd;
5276 struct dirent *file;
5277 FILE *f = NULL;
5278 size_t linelen = 0;
5279 char *line = NULL;
5280 int pd;
5281 char *path_dir, *path;
5282 char **pid;
5283
5284 /* path = dpath + "/cgroup.procs" + '\0' */
5285 do {
5286 path = malloc(strlen(dpath) + 20);
5287 } while (!path);
5288
5289 strcpy(path, dpath);
5290 fd = openat(cfd, path, O_RDONLY);
5291 if (fd < 0)
5292 goto out;
5293
5294 dir = fdopendir(fd);
5295 if (dir == NULL) {
5296 close(fd);
5297 goto out;
5298 }
5299
5300 while (((file = readdir(dir)) != NULL) && depth > 0) {
5301 if (strcmp(file->d_name, ".") == 0) /* strcmp(): the old strncmp(..., 1) also matched ".." and left the next check dead */
5302 continue;
5303 if (strcmp(file->d_name, "..") == 0)
5304 continue;
5305 if (file->d_type == DT_DIR) {
5306 /* path + '/' + d_name + '\0' */
5307 do {
5308 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5309 } while (!path_dir);
5310 strcpy(path_dir, path);
5311 strcat(path_dir, "/");
5312 strcat(path_dir, file->d_name);
5313 pd = depth - 1;
5314 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5315 free(path_dir);
5316 }
5317 }
5318 closedir(dir);
5319
5320 strcat(path, "/cgroup.procs");
5321 fd = openat(cfd, path, O_RDONLY);
5322 if (fd < 0)
5323 goto out;
5324
5325 f = fdopen(fd, "r");
5326 if (!f) {
5327 close(fd);
5328 goto out;
5329 }
5330
5331 while (getline(&line, &linelen, f) != -1) {
5332 do {
5333 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5334 } while (!pid);
5335 *pid_buf = pid;
5336 do {
5337 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5338 } while (*(*pid_buf + sum) == NULL);
5339 strcpy(*(*pid_buf + sum), line);
5340 sum++;
5341 }
5342 fclose(f);
5343 out:
5344 if (line)
5345 free(line);
5346 free(path);
5347 return sum;
5348 }
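/*
 * Usage sketch (hypothetical cgroup name): collect every PID up to
 * DEPTH_DIR levels below a container cgroup:
 *
 *   char **pids = malloc(sizeof(char *));
 *   int n = calc_pid(&pids, "./docker/abc123", DEPTH_DIR, 0, cfd);
 *   // pids[0 .. n-1] now hold newline-terminated PID strings
 *
 * The caller owns the array as well as each string (see refresh_load()).
 */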
5349 /*
5350 * calc_load calculates the load according to the following formula:
5351 * load1 = load0 * exp + active * (1 - exp)
5352 *
5353 * @load1: the new loadavg.
5354 * @load0: the former loadavg.
5355 * @active: the number of running PIDs at this moment.
5356 * @exp: the fixed-point decay constant defined at the top of this file.
5357 */
5358 static unsigned long
5359 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5360 {
5361 unsigned long newload;
5362
5363 active = active > 0 ? active * FIXED_1 : 0;
5364 newload = load * exp + active * (FIXED_1 - exp);
5365 if (active >= load)
5366 newload += FIXED_1 - 1;
5367
5368 return newload / FIXED_1;
5369 }
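/*
 * Worked example (illustrative): starting from load0 = 0 with active = 2
 * runnable tasks, exp = EXP_1 (1884) and FIXED_1 = 2048:
 *
 *   active  = 2 * 2048 = 4096
 *   newload = 0 * 1884 + 4096 * (2048 - 1884) = 671744
 *   newload += 2047 (rounded up because active >= load)
 *   result  = 673791 / 2048 = 329
 *
 * and LOAD_INT(329) = 0, LOAD_FRAC(329) = 16, so the 1-minute average
 * reported by proc_loadavg_read() starts climbing at "0.16".
 */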
5370
5371 /*
5372 * A return value of 0 means that the container p->cg is gone.
5373 * A return value of -1 means that an error occurred during the refresh.
5374 * A positive return value is the total number of PIDs found.
5375 */
5376 static int refresh_load(struct load_node *p, char *path)
5377 {
5378 FILE *f = NULL;
5379 char **idbuf;
5380 char proc_path[256];
5381 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5382 char *line = NULL;
5383 size_t linelen = 0;
5384 int sum, length;
5385 DIR *dp;
5386 struct dirent *file;
5387
5388 do {
5389 idbuf = malloc(sizeof(char *));
5390 } while (!idbuf);
5391 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5392 /* normal exit */
5393 if (sum == 0)
5394 goto out;
5395
5396 for (i = 0; i < sum; i++) {
5397 /* strip the trailing '\n' */
5398 length = strlen(idbuf[i])-1;
5399 idbuf[i][length] = '\0';
5400 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5401 if (ret < 0 || ret > 255) {
5402 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5403 i = sum;
5404 sum = -1;
5405 goto err_out;
5406 }
5407
5408 dp = opendir(proc_path);
5409 if (!dp) {
5410 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5411 continue;
5412 }
5413 while ((file = readdir(dp)) != NULL) {
5414 if (strcmp(file->d_name, ".") == 0)
5415 continue;
5416 if (strcmp(file->d_name, "..") == 0)
5417 continue;
5418 total_pid++;
5419 /* We make the biggest pid become last_pid. */
5420 ret = atoi(file->d_name); /* atoi(), not atof(): d_name is a pid and ret is an int */
5421 last_pid = (ret > last_pid) ? ret : last_pid;
5422
5423 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5424 if (ret < 0 || ret > 255) {
5425 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5426 i = sum;
5427 sum = -1;
5428 closedir(dp);
5429 goto err_out;
5430 }
5431 f = fopen(proc_path, "r");
5432 if (f != NULL) {
5433 while (getline(&line, &linelen, f) != -1) {
5434 /* Find State */
5435 if ((line[0] == 'S') && (line[1] == 't'))
5436 break;
5437 }
5438 if ((line[7] == 'R') || (line[7] == 'D'))
5439 run_pid++;
5440 fclose(f);
5441 }
5442 }
5443 closedir(dp);
5444 }
5445 /* Calculate the loadavg. */
5446 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5447 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5448 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5449 p->run_pid = run_pid;
5450 p->total_pid = total_pid;
5451 p->last_pid = last_pid;
5452
5453 err_out:
5454 free(line); /* also free the getline() buffer when bailing out early */
5455 for (; i > 0; i--)
5456 free(idbuf[i-1]);
5457 out:
5458 free(idbuf);
5459 return sum;
5460 }
5461 /*
5462 * Traverse the hash table and update it.
5463 */
5464 void *load_begin(void *arg)
5465 {
5466
5467 char *path = NULL;
5468 int i, sum, length, ret;
5469 struct load_node *f;
5470 int first_node;
5471 clock_t time1, time2;
5472
5473 while (1) {
5474 if (loadavg_stop == 1)
5475 return NULL;
5476
5477 time1 = clock();
5478 for (i = 0; i < LOAD_SIZE; i++) {
5479 pthread_mutex_lock(&load_hash[i].lock);
5480 if (load_hash[i].next == NULL) {
5481 pthread_mutex_unlock(&load_hash[i].lock);
5482 continue;
5483 }
5484 f = load_hash[i].next;
5485 first_node = 1;
5486 while (f) {
5487 length = strlen(f->cg) + 2;
5488 do {
5489 /* strlen(f->cg) + '.' or '' + \0 */
5490 path = malloc(length);
5491 } while (!path);
5492
5493 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5494 if (ret < 0 || ret > length - 1) {
5495 /* snprintf failed, ignore the node.*/
5496 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5497 goto out;
5498 }
5499 sum = refresh_load(f, path);
5500 if (sum == 0) {
5501 f = del_node(f, i);
5502 } else {
5503 out: f = f->next;
5504 }
5505 free(path);
5506 /* load_hash[i].lock is only held while the first node is processed. */
5507 if (first_node == 1) {
5508 first_node = 0;
5509 pthread_mutex_unlock(&load_hash[i].lock);
5510 }
5511 }
5512 }
5513
5514 if (loadavg_stop == 1)
5515 return NULL;
5516
5517 time2 = clock();
5518 if (FLUSH_TIME * 1000000 > (time2 - time1) * 1000000 / CLOCKS_PER_SEC) usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC)); /* usleep() takes an unsigned argument; skip the sleep entirely if a refresh pass overran FLUSH_TIME */
5519 }
5520 }
5521
5522 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5523 struct fuse_file_info *fi)
5524 {
5525 struct fuse_context *fc = fuse_get_context();
5526 struct file_info *d = (struct file_info *)fi->fh;
5527 pid_t initpid;
5528 char *cg;
5529 size_t total_len = 0;
5530 char *cache = d->buf;
5531 struct load_node *n;
5532 int hash;
5533 int cfd, rv = 0;
5534 unsigned long a, b, c;
5535
5536 if (offset) {
5537 if (offset > d->size)
5538 return -EINVAL;
5539 if (!d->cached)
5540 return 0;
5541 int left = d->size - offset;
5542 total_len = left > size ? size : left;
5543 memcpy(buf, cache + offset, total_len);
5544 return total_len;
5545 }
5546 if (!loadavg)
5547 return read_file("/proc/loadavg", buf, size, d);
5548
5549 initpid = lookup_initpid_in_store(fc->pid);
5550 if (initpid <= 0)
5551 initpid = fc->pid;
5552 cg = get_pid_cgroup(initpid, "cpu");
5553 if (!cg)
5554 return read_file("/proc/loadavg", buf, size, d);
5555
5556 prune_init_slice(cg);
5557 hash = calc_hash(cg) % LOAD_SIZE;
5558 n = locate_node(cg, hash);
5559
5560 /* First time */
5561 if (n == NULL) {
5562 if (!find_mounted_controller("cpu", &cfd)) {
5563 /*
5564 * locate_node() above intentionally leaves the read lock held, so
5565 * that the node cannot be deleted before we are done reading it.
5566 */
5567 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5568 rv = 0;
5569 goto err;
5570 }
5571 do {
5572 n = malloc(sizeof(struct load_node));
5573 } while (!n);
5574
5575 do {
5576 n->cg = malloc(strlen(cg)+1);
5577 } while (!n->cg);
5578 strcpy(n->cg, cg);
5579 n->avenrun[0] = 0;
5580 n->avenrun[1] = 0;
5581 n->avenrun[2] = 0;
5582 n->run_pid = 0;
5583 n->total_pid = 1;
5584 n->last_pid = initpid;
5585 n->cfd = cfd;
5586 insert_node(&n, hash);
5587 }
5588 a = n->avenrun[0] + (FIXED_1/200); /* + FIXED_1/200 rounds the printed value to two decimals, as the kernel does */
5589 b = n->avenrun[1] + (FIXED_1/200);
5590 c = n->avenrun[2] + (FIXED_1/200);
5591 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5592 LOAD_INT(a), LOAD_FRAC(a),
5593 LOAD_INT(b), LOAD_FRAC(b),
5594 LOAD_INT(c), LOAD_FRAC(c),
5595 n->run_pid, n->total_pid, n->last_pid);
5596 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5597 if (total_len < 0 || total_len >= d->buflen) {
5598 lxcfs_error("%s\n", "Failed to write to cache");
5599 rv = 0;
5600 goto err;
5601 }
5602 d->size = (int)total_len;
5603 d->cached = 1;
5604
5605 if (total_len > size)
5606 total_len = size;
5607 memcpy(buf, d->buf, total_len);
5608 rv = total_len;
5609
5610 err:
5611 free(cg);
5612 return rv;
5613 }
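/*
 * Example of the rendered file (hypothetical values), matching the host
 * format:
 *
 *   0.16 0.03 0.01 1/52 3842
 *
 * i.e. the 1/5/15-minute averages, running vs. total PIDs in the
 * container's cgroup, and the largest PID the refresh thread has seen.
 */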
5614 /* Return a positive number on success, return 0 on failure. */
5615 pthread_t load_daemon(int load_use)
5616 {
5617 int ret;
5618 pthread_t pid;
5619
5620 ret = init_load();
5621 if (ret == -1) {
5622 lxcfs_error("%s\n", "Failed to initialize the hash table in load_daemon!");
5623 return 0;
5624 }
5625 ret = pthread_create(&pid, NULL, load_begin, NULL);
5626 if (ret != 0) {
5627 lxcfs_error("%s\n", "Failed to create thread in load_daemon!");
5628 load_free();
5629 return 0;
5630 }
5631 /* Enable the loadavg calculation; callers pass load_use = 1. */
5632 loadavg = load_use;
5633 return pid;
5634 }
5635
5636 /* Returns 0 on success. */
5637 int stop_load_daemon(pthread_t pid)
5638 {
5639 int s;
5640
5641 /* Signal the thread to gracefully stop */
5642 loadavg_stop = 1;
5643
5644 s = pthread_join(pid, NULL); /* Wait for the refresh thread to exit. */
5645 if (s != 0) {
5646 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5647 return -1;
5648 }
5649
5650 load_free();
5651 loadavg_stop = 0;
5652
5653 return 0;
5654 }
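/*
 * Usage sketch: the pair is meant to bracket the lifetime of the loadavg
 * feature, e.g.:
 *
 *   pthread_t tid = load_daemon(1); // start the refresh thread, loadavg = 1
 *   ...
 *   if (tid != 0)
 *           stop_load_daemon(tid); // signal, join and free the hash table
 */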
5655
5656 static off_t get_procfile_size(const char *which)
5657 {
5658 FILE *f = fopen(which, "r");
5659 char *line = NULL;
5660 size_t len = 0;
5661 ssize_t sz, answer = 0;
5662 if (!f)
5663 return 0;
5664
5665 while ((sz = getline(&line, &len, f)) != -1)
5666 answer += sz;
5667 fclose(f);
5668 free(line);
5669
5670 return answer;
5671 }
5672
5673 int proc_getattr(const char *path, struct stat *sb)
5674 {
5675 struct timespec now;
5676
5677 memset(sb, 0, sizeof(struct stat));
5678 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5679 return -EINVAL;
5680 sb->st_uid = sb->st_gid = 0;
5681 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5682 if (strcmp(path, "/proc") == 0) {
5683 sb->st_mode = S_IFDIR | 00555;
5684 sb->st_nlink = 2;
5685 return 0;
5686 }
5687 if (strcmp(path, "/proc/meminfo") == 0 ||
5688 strcmp(path, "/proc/cpuinfo") == 0 ||
5689 strcmp(path, "/proc/uptime") == 0 ||
5690 strcmp(path, "/proc/stat") == 0 ||
5691 strcmp(path, "/proc/diskstats") == 0 ||
5692 strcmp(path, "/proc/swaps") == 0 ||
5693 strcmp(path, "/proc/loadavg") == 0) {
5694 sb->st_size = 0;
5695 sb->st_mode = S_IFREG | 00444;
5696 sb->st_nlink = 1;
5697 return 0;
5698 }
5699
5700 return -ENOENT;
5701 }
5702
5703 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5704 struct fuse_file_info *fi)
5705 {
5706 if (filler(buf, ".", NULL, 0) != 0 ||
5707 filler(buf, "..", NULL, 0) != 0 ||
5708 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5709 filler(buf, "meminfo", NULL, 0) != 0 ||
5710 filler(buf, "stat", NULL, 0) != 0 ||
5711 filler(buf, "uptime", NULL, 0) != 0 ||
5712 filler(buf, "diskstats", NULL, 0) != 0 ||
5713 filler(buf, "swaps", NULL, 0) != 0 ||
5714 filler(buf, "loadavg", NULL, 0) != 0)
5715 return -EINVAL;
5716 return 0;
5717 }
5718
5719 int proc_open(const char *path, struct fuse_file_info *fi)
5720 {
5721 int type = -1;
5722 struct file_info *info;
5723
5724 if (strcmp(path, "/proc/meminfo") == 0)
5725 type = LXC_TYPE_PROC_MEMINFO;
5726 else if (strcmp(path, "/proc/cpuinfo") == 0)
5727 type = LXC_TYPE_PROC_CPUINFO;
5728 else if (strcmp(path, "/proc/uptime") == 0)
5729 type = LXC_TYPE_PROC_UPTIME;
5730 else if (strcmp(path, "/proc/stat") == 0)
5731 type = LXC_TYPE_PROC_STAT;
5732 else if (strcmp(path, "/proc/diskstats") == 0)
5733 type = LXC_TYPE_PROC_DISKSTATS;
5734 else if (strcmp(path, "/proc/swaps") == 0)
5735 type = LXC_TYPE_PROC_SWAPS;
5736 else if (strcmp(path, "/proc/loadavg") == 0)
5737 type = LXC_TYPE_PROC_LOADAVG;
5738 if (type == -1)
5739 return -ENOENT;
5740
5741 info = malloc(sizeof(*info));
5742 if (!info)
5743 return -ENOMEM;
5744
5745 memset(info, 0, sizeof(*info));
5746 info->type = type;
5747
5748 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5749 do {
5750 info->buf = malloc(info->buflen);
5751 } while (!info->buf);
5752 memset(info->buf, 0, info->buflen);
5753 /* set actual size to buffer size */
5754 info->size = info->buflen;
5755
5756 fi->fh = (unsigned long)info;
5757 return 0;
5758 }
5759
5760 int proc_access(const char *path, int mask)
5761 {
5762 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5763 return 0;
5764
5765 /* these are all read-only */
5766 if ((mask & ~R_OK) != 0)
5767 return -EACCES;
5768 return 0;
5769 }
5770
5771 int proc_release(const char *path, struct fuse_file_info *fi)
5772 {
5773 do_release_file_info(fi);
5774 return 0;
5775 }
5776
5777 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5778 struct fuse_file_info *fi)
5779 {
5780 struct file_info *f = (struct file_info *) fi->fh;
5781
5782 switch (f->type) {
5783 case LXC_TYPE_PROC_MEMINFO:
5784 return proc_meminfo_read(buf, size, offset, fi);
5785 case LXC_TYPE_PROC_CPUINFO:
5786 return proc_cpuinfo_read(buf, size, offset, fi);
5787 case LXC_TYPE_PROC_UPTIME:
5788 return proc_uptime_read(buf, size, offset, fi);
5789 case LXC_TYPE_PROC_STAT:
5790 return proc_stat_read(buf, size, offset, fi);
5791 case LXC_TYPE_PROC_DISKSTATS:
5792 return proc_diskstats_read(buf, size, offset, fi);
5793 case LXC_TYPE_PROC_SWAPS:
5794 return proc_swaps_read(buf, size, offset, fi);
5795 case LXC_TYPE_PROC_LOADAVG:
5796 return proc_loadavg_read(buf, size, offset, fi);
5797 default:
5798 return -EINVAL;
5799 }
5800 }
5801
5802 /*
5803 * Functions needed to setup cgroups in the __constructor__.
5804 */
5805
5806 static bool mkdir_p(const char *dir, mode_t mode)
5807 {
5808 const char *tmp = dir;
5809 const char *orig = dir;
5810 char *makeme;
5811
5812 do {
5813 dir = tmp + strspn(tmp, "/");
5814 tmp = dir + strcspn(dir, "/");
5815 makeme = strndup(orig, dir - orig);
5816 if (!makeme)
5817 return false;
5818 if (mkdir(makeme, mode) && errno != EEXIST) {
5819 lxcfs_error("Failed to create directory '%s': %s.\n",
5820 makeme, strerror(errno));
5821 free(makeme);
5822 return false;
5823 }
5824 free(makeme);
5825 } while(tmp != dir);
5826
5827 return true;
5828 }
5829
5830 static bool umount_if_mounted(void)
5831 {
5832 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5833 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5834 return false;
5835 }
5836 return true;
5837 }
5838
5839 /* __typeof__ should be safe to use with all compilers. */
5840 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5841 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5842 {
5843 return (fs->f_type == (fs_type_magic)magic_val);
5844 }
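/*
 * Usage sketch (mirrors permute_and_enter() below):
 *
 *   struct statfs sb;
 *   if (statfs("/", &sb) == 0 && has_fs_type(&sb, RAMFS_MAGIC))
 *           ... // on a ramfs root: chroot() instead of pivot_root()
 *
 * The cast through fs_type_magic matters because the width of f_type
 * differs between architectures.
 */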
5845
5846 /*
5847 * looking at fs/proc_namespace.c, it appears we can
5848 * actually expect the rootfs entry to very specifically contain
5849 * " - rootfs rootfs "
5850 * IIUC, so long as we've chrooted so that rootfs is not our root,
5851 * the rootfs entry should always be skipped in mountinfo contents.
5852 */
5853 static bool is_on_ramfs(void)
5854 {
5855 FILE *f;
5856 char *p, *p2;
5857 char *line = NULL;
5858 size_t len = 0;
5859 int i;
5860
5861 f = fopen("/proc/self/mountinfo", "r");
5862 if (!f)
5863 return false;
5864
5865 while (getline(&line, &len, f) != -1) {
5866 for (p = line, i = 0; p && i < 4; i++)
5867 p = strchr(p + 1, ' ');
5868 if (!p)
5869 continue;
5870 p2 = strchr(p + 1, ' ');
5871 if (!p2)
5872 continue;
5873 *p2 = '\0';
5874 if (strcmp(p + 1, "/") == 0) {
5875 // this is '/'. is it the ramfs?
5876 p = strchr(p2 + 1, '-');
5877 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5878 free(line);
5879 fclose(f);
5880 return true;
5881 }
5882 }
5883 }
5884 free(line);
5885 fclose(f);
5886 return false;
5887 }
5888
5889 static int pivot_enter()
5890 {
5891 int ret = -1, oldroot = -1, newroot = -1;
5892
5893 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5894 if (oldroot < 0) {
5895 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5896 return ret;
5897 }
5898
5899 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5900 if (newroot < 0) {
5901 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5902 goto err;
5903 }
5904
5905 /* change into new root fs */
5906 if (fchdir(newroot) < 0) {
5907 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5908 goto err;
5909 }
5910
5911 /* pivot_root into our new root fs */
5912 if (pivot_root(".", ".") < 0) {
5913 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5914 goto err;
5915 }
5916
5917 /*
5918 * At this point the old-root is mounted on top of our new-root.
5919 * To unmount it we must not be chdir'd into it, so escape back
5920 * to the old-root.
5921 */
5922 if (fchdir(oldroot) < 0) {
5923 lxcfs_error("%s\n", "Failed to enter old root.");
5924 goto err;
5925 }
5926
5927 if (umount2(".", MNT_DETACH) < 0) {
5928 lxcfs_error("%s\n", "Failed to detach old root.");
5929 goto err;
5930 }
5931
5932 if (fchdir(newroot) < 0) {
5933 lxcfs_error("%s\n", "Failed to re-enter new root.");
5934 goto err;
5935 }
5936
5937 ret = 0;
5938
5939 err:
5940 if (oldroot >= 0)
5941 close(oldroot);
5942 if (newroot >= 0)
5943 close(newroot);
5944
5945 return ret;
5946 }
5947
5948 static int chroot_enter()
5949 {
5950 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5951 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5952 return -1;
5953 }
5954
5955 if (chroot(".") < 0) {
5956 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5957 return -1;
5958 }
5959
5960 if (chdir("/") < 0) {
5961 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5962 return -1;
5963 }
5964
5965 return 0;
5966 }
5967
5968 static int permute_and_enter(void)
5969 {
5970 struct statfs sb;
5971
5972 if (statfs("/", &sb) < 0) {
5973 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5974 return -1;
5975 }
5976
5977 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5978 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5979 * /proc/1/mountinfo. */
5980 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5981 return chroot_enter();
5982
5983 if (pivot_enter() < 0) {
5984 lxcfs_error("%s\n", "Could not perform pivot root.");
5985 return -1;
5986 }
5987
5988 return 0;
5989 }
5990
5991 /* Prepare our new clean root. */
5992 static int permute_prepare(void)
5993 {
5994 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5995 lxcfs_error("%s\n", "Failed to create directory for new root.");
5996 return -1;
5997 }
5998
5999 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6000 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6001 return -1;
6002 }
6003
6004 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6005 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6006 return -1;
6007 }
6008
6009 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6010 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6011 return -1;
6012 }
6013
6014 return 0;
6015 }
6016
6017 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6018 static bool permute_root(void)
6019 {
6020 /* Prepare new root. */
6021 if (permute_prepare() < 0)
6022 return false;
6023
6024 /* Pivot into new root. */
6025 if (permute_and_enter() < 0)
6026 return false;
6027
6028 return true;
6029 }
6030
6031 static int preserve_mnt_ns(int pid)
6032 {
6033 int ret;
6034 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6035 char path[len];
6036
6037 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6038 if (ret < 0 || (size_t)ret >= len)
6039 return -1;
6040
6041 return open(path, O_RDONLY | O_CLOEXEC);
6042 }
6043
6044 static bool cgfs_prepare_mounts(void)
6045 {
6046 if (!mkdir_p(BASEDIR, 0700)) {
6047 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6048 return false;
6049 }
6050
6051 if (!umount_if_mounted()) {
6052 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6053 return false;
6054 }
6055
6056 if (unshare(CLONE_NEWNS) < 0) {
6057 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6058 return false;
6059 }
6060
6061 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6062 if (cgroup_mount_ns_fd < 0) {
6063 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6064 return false;
6065 }
6066
6067 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6068 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6069 return false;
6070 }
6071
6072 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6073 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6074 return false;
6075 }
6076
6077 return true;
6078 }
6079
6080 static bool cgfs_mount_hierarchies(void)
6081 {
6082 char *target;
6083 size_t clen, len;
6084 int i, ret;
6085
6086 for (i = 0; i < num_hierarchies; i++) {
6087 char *controller = hierarchies[i];
6088
6089 clen = strlen(controller);
6090 len = strlen(BASEDIR) + clen + 2;
6091 target = malloc(len);
6092 if (!target)
6093 return false;
6094
6095 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6096 if (ret < 0 || ret >= len) {
6097 free(target);
6098 return false;
6099 }
6100 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6101 free(target);
6102 return false;
6103 }
6104 if (!strcmp(controller, "unified"))
6105 ret = mount("none", target, "cgroup2", 0, NULL);
6106 else
6107 ret = mount(controller, target, "cgroup", 0, controller);
6108 if (ret < 0) {
6109 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6110 free(target);
6111 return false;
6112 }
6113
6114 fd_hierarchies[i] = open(target, O_DIRECTORY);
6115 if (fd_hierarchies[i] < 0) {
6116 free(target);
6117 return false;
6118 }
6119 free(target);
6120 }
6121 return true;
6122 }
6123
6124 static bool cgfs_setup_controllers(void)
6125 {
6126 if (!cgfs_prepare_mounts())
6127 return false;
6128
6129 if (!cgfs_mount_hierarchies()) {
6130 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6131 return false;
6132 }
6133
6134 if (!permute_root())
6135 return false;
6136
6137 return true;
6138 }
6139
6140 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6141 {
6142 FILE *f;
6143 char *cret, *line = NULL;
6144 char cwd[MAXPATHLEN];
6145 size_t len = 0;
6146 int i, init_ns = -1;
6147 bool found_unified = false;
6148
6149 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6150 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6151 return;
6152 }
6153
6154 while (getline(&line, &len, f) != -1) {
6155 char *idx, *p, *p2;
6156
6157 p = strchr(line, ':');
6158 if (!p)
6159 goto out;
6160 idx = line;
6161 *(p++) = '\0';
6162
6163 p2 = strrchr(p, ':');
6164 if (!p2)
6165 goto out;
6166 *p2 = '\0';
6167
6168 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6169 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6170 * because it parses out the empty string "" and later on passes
6171 * it to mount(). Let's skip such entries.
6172 */
6173 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6174 found_unified = true;
6175 p = "unified";
6176 }
6177
6178 if (!store_hierarchy(line, p))
6179 goto out;
6180 }
6181
6182 /* Preserve initial namespace. */
6183 init_ns = preserve_mnt_ns(getpid());
6184 if (init_ns < 0) {
6185 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6186 goto out;
6187 }
6188
6189 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6190 if (!fd_hierarchies) {
6191 lxcfs_error("%s\n", strerror(errno));
6192 goto out;
6193 }
6194
6195 for (i = 0; i < num_hierarchies; i++)
6196 fd_hierarchies[i] = -1;
6197
6198 cret = getcwd(cwd, MAXPATHLEN);
6199 if (!cret)
6200 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6201
6202 /* This function unshares (CLONE_NEWNS) from our initial mount namespace
6203 * so that lxcfs can mount its cgroup hierarchies privately. */
6204 if (!cgfs_setup_controllers()) {
6205 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6206 goto out;
6207 }
6208
6209 if (setns(init_ns, 0) < 0) {
6210 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6211 goto out;
6212 }
6213
6214 if (!cret || chdir(cwd) < 0)
6215 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6216
6217 if (!init_cpuview()) {
6218 lxcfs_error("%s\n", "failed to init CPU view");
6219 goto out;
6220 }
6221
6222 print_subsystems();
6223
6224 out:
6225 free(line);
6226 fclose(f);
6227 if (init_ns >= 0)
6228 close(init_ns);
6229 }
6230
6231 static void __attribute__((destructor)) free_subsystems(void)
6232 {
6233 int i;
6234
6235 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6236
6237 for (i = 0; i < num_hierarchies; i++) {
6238 if (hierarchies[i])
6239 free(hierarchies[i]);
6240 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6241 close(fd_hierarchies[i]);
6242 }
6243 free(hierarchies);
6244 free(fd_hierarchies);
6245 free_cpuview();
6246
6247 if (cgroup_mount_ns_fd >= 0)
6248 close(cgroup_mount_ns_fd);
6249 }