]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
stat: use cpu views
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
42 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
43 #define LXCFS_NUMSTRLEN64 21
44
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Fallback wrapper: issue the raw system call when libc does not expose it. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* Architecture without the syscall number: report "not implemented". */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
59
/* Kind of virtualized object a struct file_info describes. */
enum {
	LXC_TYPE_CGDIR,          /* a cgroup directory */
	LXC_TYPE_CGFILE,         /* a cgroup file */
	LXC_TYPE_PROC_MEMINFO,   /* virtualized /proc/meminfo */
	LXC_TYPE_PROC_CPUINFO,   /* virtualized /proc/cpuinfo */
	LXC_TYPE_PROC_UPTIME,    /* virtualized /proc/uptime */
	LXC_TYPE_PROC_STAT,      /* virtualized /proc/stat */
	LXC_TYPE_PROC_DISKSTATS, /* virtualized /proc/diskstats */
	LXC_TYPE_PROC_SWAPS,     /* virtualized /proc/swaps */
	LXC_TYPE_PROC_LOADAVG,   /* virtualized /proc/loadavg */
};

/* Per-open-file state kept across FUSE callbacks. */
struct file_info {
	char *controller; /* cgroup controller name */
	char *cgroup;     /* cgroup path */
	char *file;       /* file name within the cgroup */
	int type;         /* one of the LXC_TYPE_* values above */
	char *buf; // unused as of yet
	int buflen; /* allocated size of buf */
	int size; //actual data size
	int cached; /* nonzero once buf holds generated content */
};

/* One CPU's accounting totals (units presumably USER_HZ ticks as in
 * /proc/stat - TODO confirm against the readers of this struct). */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
};
88
/* Parameters of the loadavg hash table. */
#define LOAD_SIZE 100 /*the size of hash_table */
#define FLUSH_TIME 5 /*the flush rate */
#define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point load-average math, mirroring the kernel's calc_load(). */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* Set from a signal/stop path to ask the loadavg refresh loop to exit. */
static volatile sig_atomic_t loadavg_stop = 0;
/* Hash a cgroup name into a non-negative bucket index (classic ELF hash). */
static int calc_hash(const char *name)
{
	unsigned int h = 0;
	unsigned int high;

	/* ELFHash algorithm. */
	for (; *name; name++) {
		h = (h << 4) + *name;
		high = h & 0xf0000000;
		if (high != 0)
			h ^= high >> 24;
		h &= ~high;
	}

	return (int)(h & 0x7fffffff);
}
121
/* One tracked cgroup in the loadavg hash table. */
struct load_node {
	char *cg; /*cg */
	unsigned long avenrun[3]; /* Load averages */
	unsigned int run_pid;   /* presumably: running task count - confirm against refresh code */
	unsigned int total_pid;  /* presumably: total task count - confirm against refresh code */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next;  /* next node in this hash bucket */
	struct load_node **pre;  /* address of the pointer that points at us (for O(1) unlink) */
};

struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node. To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node. To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node. To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initialize the hash table.
 * Return 0 on success, return -1 on failure.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
	/* Unwind: first destroy whatever was initialized in the failing
	 * bucket i (fall-through labels), ... */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	/* ... then tear down every fully initialized bucket before i. */
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
197
/* Prepend *n to hash bucket @locate, maintaining the back-pointer (pre)
 * links. Holds the bucket's refresh lock and insert/read lock so it cannot
 * race with refresh or with readers of the bucket head. */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	/* New node becomes the head; fix both directions of the links. */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() finds special node. Not return NULL means success.
 * It should be noted that rdlock isn't unlocked at the end of code
 * because this function is used to read special node. Delete is not
 * allowed before read has ended.
 * unlock rdlock only in proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	/* NOTE: rdlock stays held on BOTH return paths (see comment above);
	 * the caller must release it. */
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the bucket's chain for an exact cgroup-name match. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
/* Delete the load_node n and return the next node of it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	/* Write-lock rdlock so no reader can still reference @n while it is
	 * unlinked and freed. */
	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
256
/* Tear down the loadavg hash table on shutdown: free every node and destroy
 * all per-bucket locks. Assumes the refresh thread has already stopped. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		/* Take all three locks so no concurrent reader/inserter can
		 * touch the bucket while it is dismantled. */
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free the whole chain; no link fixup needed since the entire
		 * bucket goes away. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
289
/* Data for CPU view */
/* Per-cgroup CPU accounting snapshot used to virtualize /proc/stat. */
struct cg_proc_stat {
	char *cg; /* cgroup path this entry belongs to */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* number of entries in usage/view */
	struct cg_proc_stat *next; /* hash-bucket chaining */
};

/* Head of one bucket of cg_proc_stat entries. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
305
306 static bool cpuview_init_head(struct cg_proc_stat_head **head)
307 {
308 *head = malloc(sizeof(struct cg_proc_stat_head));
309 if (!(*head)) {
310 lxcfs_error("%s\n", strerror(errno));
311 return false;
312 }
313
314 (*head)->next = NULL;
315 return true;
316 }
317
318 static bool init_cpuview()
319 {
320 int i;
321
322 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
323 proc_stat_history[i] = NULL;
324
325 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
326 if (!cpuview_init_head(&proc_stat_history[i]))
327 goto err;
328 }
329
330 return true;
331
332 err:
333 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
334 if (proc_stat_history[i]) {
335 free(proc_stat_history[i]);
336 proc_stat_history[i] = NULL;
337 }
338 }
339
340 return false;
341 }
342
343 static void cpuview_free_head(struct cg_proc_stat_head *head)
344 {
345 struct cg_proc_stat *node, *tmp;
346
347 if (head->next) {
348 node = head->next;
349
350 for (;;) {
351 tmp = node;
352 node = node->next;
353
354 free(tmp->cg);
355 free(tmp->usage);
356 free(tmp->view);
357 free(tmp);
358
359 if (!node)
360 break;
361 }
362 }
363
364 free(head);
365 }
366
367 static void free_cpuview()
368 {
369 int i;
370
371 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
372 if (proc_stat_history[i])
373 cpuview_free_head(proc_stat_history[i]);
374 }
375 }
376
/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // bucket chaining
	long int lastcheck; // last time this entry was validated/used
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Protects pidns_hash_table; take via store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Take @l; a locking failure is unrecoverable, so log and abort. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
417
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* fd of the private cgroup mount namespace, or -1 when not set up. */
static int cgroup_mount_ns_fd = -1;
438
/* Release @l; an unlocking failure is unrecoverable, so log and abort. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
448
/* Serialize access to the pidns init-pid store. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the pidns init-pid store lock. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
458
459 /* Must be called under store_lock */
460 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
461 {
462 struct stat initsb;
463 char fnam[100];
464
465 snprintf(fnam, 100, "/proc/%d", e->initpid);
466 if (stat(fnam, &initsb) < 0)
467 return false;
468
469 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
470 initsb.st_ctime, e->initpid);
471
472 if (e->ctime != initsb.st_ctime)
473 return false;
474 return true;
475 }
476
477 /* Must be called under store_lock */
478 static void remove_initpid(struct pidns_init_store *e)
479 {
480 struct pidns_init_store *tmp;
481 int h;
482
483 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
484
485 h = HASH(e->ino);
486 if (pidns_hash_table[h] == e) {
487 pidns_hash_table[h] = e->next;
488 free(e);
489 return;
490 }
491
492 tmp = pidns_hash_table[h];
493 while (tmp) {
494 if (tmp->next == e) {
495 tmp->next = e->next;
496 free(e);
497 return;
498 }
499 tmp = tmp->next;
500 }
501 }
502
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Evict cached pidns->init entries not looked up within the last
 * 2*PURGE_SECS seconds. Runs at most once every PURGE_SECS seconds;
 * the very first call only arms the timer. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				/* Unlink e from the bucket before freeing. */
				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
545
546 /* Must be called under store_lock */
547 static void save_initpid(struct stat *sb, pid_t pid)
548 {
549 struct pidns_init_store *e;
550 char fpath[100];
551 struct stat procsb;
552 int h;
553
554 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
555
556 snprintf(fpath, 100, "/proc/%d", pid);
557 if (stat(fpath, &procsb) < 0)
558 return;
559 do {
560 e = malloc(sizeof(*e));
561 } while (!e);
562 e->ino = sb->st_ino;
563 e->initpid = pid;
564 e->ctime = procsb.st_ctime;
565 h = HASH(e->ino);
566 e->next = pidns_hash_table[h];
567 e->lastcheck = time(NULL);
568 pidns_hash_table[h] = e;
569 }
570
571 /*
572 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
573 * entry for the inode number and creation time. Verify that the init pid
574 * is still valid. If not, remove it. Return the entry if valid, NULL
575 * otherwise.
576 * Must be called under store_lock
577 */
578 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
579 {
580 int h = HASH(sb->st_ino);
581 struct pidns_init_store *e = pidns_hash_table[h];
582
583 while (e) {
584 if (e->ino == sb->st_ino) {
585 if (initpid_still_valid(e, sb)) {
586 e->lastcheck = time(NULL);
587 return e;
588 }
589 remove_initpid(e);
590 return NULL;
591 }
592 e = e->next;
593 }
594
595 return NULL;
596 }
597
598 static int is_dir(const char *path, int fd)
599 {
600 struct stat statbuf;
601 int ret = fstatat(fd, path, &statbuf, fd);
602 if (ret == 0 && S_ISDIR(statbuf.st_mode))
603 return 1;
604 return 0;
605 }
606
607 static char *must_copy_string(const char *str)
608 {
609 char *dup = NULL;
610 if (!str)
611 return NULL;
612 do {
613 dup = strdup(str);
614 } while (!dup);
615
616 return dup;
617 }
618
/* Strip every trailing '\n' from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t n = strlen(s);

	while (n > 0 && s[n - 1] == '\n')
		s[--n] = '\0';
}
626
#define BATCH_SIZE 50
/* Grow *mem, in BATCH_SIZE-byte batches, so it can hold newlen bytes.
 * Never shrinks; retries realloc until it succeeds. A NULL *mem is always
 * (re)allocated. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	for (;;) {
		char *grown = realloc(*mem, want * BATCH_SIZE);
		if (grown) {
			*mem = grown;
			return;
		}
	}
}
/* Append @line (of length @linelen, plus its NUL) to *contents, growing the
 * buffer via dorealloc() and updating *len to the new content length. */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t total = *len + linelen;

	dorealloc(contents, *len, total + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = total;
}
648
/* Read the whole of @fd (named @from for error reporting) into a newly
 * allocated string with trailing newlines stripped. Takes ownership of @fd:
 * it is closed on all paths. Returns NULL on error or empty file. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* Bug fix: we still own @fd when fdopen() fails; close it to
		 * avoid leaking a descriptor per failed read. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f); /* also closes fd */

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
670
/* Write @string to @fd (named @fnam for error reporting). Takes ownership
 * of @fd: it is closed on all paths. Returns true only if the full string
 * was written and the stream closed (flushed) cleanly. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* Bug fix: we still own @fd when fdopen() fails; close it to
		 * avoid leaking a descriptor per failed write. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
696
/* Ownership and mode of one cgroup file or directory, as exposed to FUSE. */
struct cgfs_files {
	char *name;        /* base name of the file/directory */
	uint32_t uid, gid; /* owner */
	uint32_t mode;     /* st_mode bits */
};
702
703 #define ALLOC_NUM 20
704 static bool store_hierarchy(char *stridx, char *h)
705 {
706 if (num_hierarchies % ALLOC_NUM == 0) {
707 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
708 n *= ALLOC_NUM;
709 char **tmp = realloc(hierarchies, n * sizeof(char *));
710 if (!tmp) {
711 lxcfs_error("%s\n", strerror(errno));
712 exit(1);
713 }
714 hierarchies = tmp;
715 }
716
717 hierarchies[num_hierarchies++] = must_copy_string(h);
718 return true;
719 }
720
/* Debug helper: dump the cgroup mount namespace fd and every mounted
 * hierarchy (with its cached directory fd) to stderr. */
static void print_subsystems(void)
{
	int i;

	fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i])
			fprintf(stderr, " %2d: fd: %3d: %s\n", i,
				fd_hierarchies[i], hierarchies[i]);
	}
}
733
/* Return true if @needle appears as a complete comma-separated token of
 * @haystack (e.g. "cpu" is in "cpu,cpuacct" but not in "cpuset"). */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	const char *comma;
	size_t want = strlen(needle);

	while (*cur && (comma = strchr(cur, ','))) {
		size_t toklen = comma - cur;

		if (toklen == want && strncmp(needle, cur, want) == 0)
			return true;

		cur = comma + 1;
	}

	/* Final (or only) token has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
752
753 /* do we need to do any massaging here? I'm not sure... */
754 /* Return the mounted controller and store the corresponding open file descriptor
755 * referring to the controller mountpoint in the private lxcfs namespace in
756 * @cfd.
757 */
758 static char *find_mounted_controller(const char *controller, int *cfd)
759 {
760 int i;
761
762 for (i = 0; i < num_hierarchies; i++) {
763 if (!hierarchies[i])
764 continue;
765 if (strcmp(hierarchies[i], controller) == 0) {
766 *cfd = fd_hierarchies[i];
767 return hierarchies[i];
768 }
769 if (in_comma_list(controller, hierarchies[i])) {
770 *cfd = fd_hierarchies[i];
771 return hierarchies[i];
772 }
773 }
774
775 return NULL;
776 }
777
/* Write @value into @file of @cgroup under @controller.
 * Returns false if the controller is not mounted, the path is too long, or
 * the open/write fails. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *path, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Build a relative path for the *at() family:
	 * "." + cgroup + "/" + file + "\0".
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
804
805 // Chown all the files in the cgroup directory. We do this when we create
806 // a cgroup on behalf of a user.
807 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
808 {
809 struct dirent *direntp;
810 char path[MAXPATHLEN];
811 size_t len;
812 DIR *d;
813 int fd1, ret;
814
815 len = strlen(dirname);
816 if (len >= MAXPATHLEN) {
817 lxcfs_error("Pathname too long: %s\n", dirname);
818 return;
819 }
820
821 fd1 = openat(fd, dirname, O_DIRECTORY);
822 if (fd1 < 0)
823 return;
824
825 d = fdopendir(fd1);
826 if (!d) {
827 lxcfs_error("Failed to open %s\n", dirname);
828 return;
829 }
830
831 while ((direntp = readdir(d))) {
832 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
833 continue;
834 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
835 if (ret < 0 || ret >= MAXPATHLEN) {
836 lxcfs_error("Pathname too long under %s\n", dirname);
837 continue;
838 }
839 if (fchownat(fd, path, uid, gid, 0) < 0)
840 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
841 }
842 closedir(d);
843 }
844
845 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
846 {
847 int cfd;
848 size_t len;
849 char *dirnam, *tmpc;
850
851 tmpc = find_mounted_controller(controller, &cfd);
852 if (!tmpc)
853 return -EINVAL;
854
855 /* Make sure we pass a relative path to *at() family of functions.
856 * . + /cg + \0
857 */
858 len = strlen(cg) + 2;
859 dirnam = alloca(len);
860 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
861
862 if (mkdirat(cfd, dirnam, 0755) < 0)
863 return -errno;
864
865 if (uid == 0 && gid == 0)
866 return 0;
867
868 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
869 return -errno;
870
871 chown_all_cgroup_files(dirnam, uid, gid, cfd);
872
873 return 0;
874 }
875
/* Recursively delete cgroup directory @dirname.
 * @fd is an open fd used for enumeration; @cfd is the controller mount fd
 * that relative paths (stat/unlink) resolve against.
 * Returns true only if everything, including @dirname itself, was removed. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* NOTE(review): the recursion passes the caller's @fd, not an
		 * fd for the child directory - confirm this is intended, since
		 * unlinkat() below resolves @pathname against @cfd anyway. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
934
935 bool cgfs_remove(const char *controller, const char *cg)
936 {
937 int fd, cfd;
938 size_t len;
939 char *dirnam, *tmpc;
940 bool bret;
941
942 tmpc = find_mounted_controller(controller, &cfd);
943 if (!tmpc)
944 return false;
945
946 /* Make sure we pass a relative path to *at() family of functions.
947 * . + /cg + \0
948 */
949 len = strlen(cg) + 2;
950 dirnam = alloca(len);
951 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
952
953 fd = openat(cfd, dirnam, O_DIRECTORY);
954 if (fd < 0)
955 return false;
956
957 bret = recursive_rmdir(dirnam, fd, cfd);
958 close(fd);
959 return bret;
960 }
961
/* chmod @file under @controller to @mode. Returns true on success. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *path, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() family: "." + file + "\0". */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) == 0;
}
982
983 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
984 {
985 size_t len;
986 char *fname;
987
988 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
989 fname = alloca(len);
990 snprintf(fname, len, "%s/tasks", dirname);
991 if (fchownat(fd, fname, uid, gid, 0) != 0)
992 return -errno;
993 snprintf(fname, len, "%s/cgroup.procs", dirname);
994 if (fchownat(fd, fname, uid, gid, 0) != 0)
995 return -errno;
996 return 0;
997 }
998
999 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
1000 {
1001 int cfd;
1002 size_t len;
1003 char *pathname, *tmpc;
1004
1005 tmpc = find_mounted_controller(controller, &cfd);
1006 if (!tmpc)
1007 return -EINVAL;
1008
1009 /* Make sure we pass a relative path to *at() family of functions.
1010 * . + /file + \0
1011 */
1012 len = strlen(file) + 2;
1013 pathname = alloca(len);
1014 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1015 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
1016 return -errno;
1017
1018 if (is_dir(pathname, cfd))
1019 // like cgmanager did, we want to chown the tasks file as well
1020 return chown_tasks_files(pathname, uid, gid, cfd);
1021
1022 return 0;
1023 }
1024
/* Open @cgroup's cgroup.procs file under @controller for writing.
 * Returns a stdio stream (caller fcloses) or NULL on error. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		/* Bug fix: fd used to leak when fdopen() failed. */
		close(fd);
	return f;
}
1048
1049 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1050 void ***list, size_t typesize,
1051 void* (*iterator)(const char*, const char*, const char*))
1052 {
1053 int cfd, fd, ret;
1054 size_t len;
1055 char *cg, *tmpc;
1056 char pathname[MAXPATHLEN];
1057 size_t sz = 0, asz = 0;
1058 struct dirent *dirent;
1059 DIR *dir;
1060
1061 tmpc = find_mounted_controller(controller, &cfd);
1062 *list = NULL;
1063 if (!tmpc)
1064 return false;
1065
1066 /* Make sure we pass a relative path to *at() family of functions. */
1067 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1068 cg = alloca(len);
1069 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1070 if (ret < 0 || (size_t)ret >= len) {
1071 lxcfs_error("Pathname too long under %s\n", cgroup);
1072 return false;
1073 }
1074
1075 fd = openat(cfd, cg, O_DIRECTORY);
1076 if (fd < 0)
1077 return false;
1078
1079 dir = fdopendir(fd);
1080 if (!dir)
1081 return false;
1082
1083 while ((dirent = readdir(dir))) {
1084 struct stat mystat;
1085
1086 if (!strcmp(dirent->d_name, ".") ||
1087 !strcmp(dirent->d_name, ".."))
1088 continue;
1089
1090 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1091 if (ret < 0 || ret >= MAXPATHLEN) {
1092 lxcfs_error("Pathname too long under %s\n", cg);
1093 continue;
1094 }
1095
1096 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1097 if (ret) {
1098 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1099 continue;
1100 }
1101 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1102 (directories && !S_ISDIR(mystat.st_mode)))
1103 continue;
1104
1105 if (sz+2 >= asz) {
1106 void **tmp;
1107 asz += BATCH_SIZE;
1108 do {
1109 tmp = realloc(*list, asz * typesize);
1110 } while (!tmp);
1111 *list = tmp;
1112 }
1113 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1114 (*list)[sz+1] = NULL;
1115 sz++;
1116 }
1117 if (closedir(dir) < 0) {
1118 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1119 return false;
1120 }
1121 return true;
1122 }
1123
1124 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1125 {
1126 char *dup;
1127 do {
1128 dup = strdup(dir_entry);
1129 } while (!dup);
1130 return dup;
1131 }
1132
/* List the names of @cgroup's child cgroups under @controller.
 * On success *list is a NULL-terminated array of strdup'd names owned by
 * the caller. Returns false on lookup/enumeration failure. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1137
1138 void free_key(struct cgfs_files *k)
1139 {
1140 if (!k)
1141 return;
1142 free(k->name);
1143 free(k);
1144 }
1145
/* Release a NULL-terminated array of cgfs_files entries and the array
 * itself (safe to call with NULL). */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);

	free(keys);
}
1157
/* Read the contents of @file in @cgroup under @controller into a newly
 * allocated string stored in *value (caller frees).
 * Returns false on lookup, path-length, open, or read failure. */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *path, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for the *at() family:
	 * "." + cgroup + "/" + file + "\0".
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);
	return *value != NULL;
}
1184
1185 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1186 {
1187 int ret, cfd;
1188 size_t len;
1189 char *fnam, *tmpc;
1190 struct stat sb;
1191 struct cgfs_files *newkey;
1192
1193 tmpc = find_mounted_controller(controller, &cfd);
1194 if (!tmpc)
1195 return false;
1196
1197 if (file && *file == '/')
1198 file++;
1199
1200 if (file && strchr(file, '/'))
1201 return NULL;
1202
1203 /* Make sure we pass a relative path to *at() family of functions.
1204 * . + /cgroup + / + file + \0
1205 */
1206 len = strlen(cgroup) + 3;
1207 if (file)
1208 len += strlen(file) + 1;
1209 fnam = alloca(len);
1210 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1211 file ? "/" : "", file ? file : "");
1212
1213 ret = fstatat(cfd, fnam, &sb, 0);
1214 if (ret < 0)
1215 return NULL;
1216
1217 do {
1218 newkey = malloc(sizeof(struct cgfs_files));
1219 } while (!newkey);
1220 if (file)
1221 newkey->name = must_copy_string(file);
1222 else if (strrchr(cgroup, '/'))
1223 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1224 else
1225 newkey->name = must_copy_string(cgroup);
1226 newkey->uid = sb.st_uid;
1227 newkey->gid = sb.st_gid;
1228 newkey->mode = sb.st_mode;
1229
1230 return newkey;
1231 }
1232
/* Iteration callback: build one cgfs_files entry for @dir_entry, logging
 * (but tolerating) lookup failures. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key;

	key = cgfs_get_key(controller, cgroup, dir_entry);
	if (key == NULL)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return key;
}
1242
/*
 * List the keys (files) directly under @cgroup for @controller into *keys,
 * a NULL-terminated array built by cgfs_iterate_cgroup() from
 * make_key_list_entry(). Returns true on success; caller frees with
 * free_keys().
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1247
/* Return true when @f exists as a directory (i.e. a child cgroup) under
 * @cgroup for @controller. */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Relative path for fstatat(): . + /cgroup + / + f + \0 */
	len = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1275
/* Result codes for send_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1	/* sendmsg failed because the peer task is gone */
#define SEND_CREDS_FAIL 2
/* Forward declarations for the SCM_CREDENTIALS helpers defined below. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1283
1284 /*
1285 * clone a task which switches to @task's namespace and writes '1'.
1286 * over a unix sock so we can read the task's reaper's pid in our
1287 * namespace
1288 *
1289 * Note: glibc's fork() does not respect pidns, which can lead to failed
1290 * assertions inside glibc (and thus failed forks) if the child's pid in
1291 * the pidns and the parent pid outside are identical. Using clone prevents
1292 * this issue.
1293 */
1294 static void write_task_init_pid_exit(int sock, pid_t target)
1295 {
1296 char fnam[100];
1297 pid_t pid;
1298 int fd, ret;
1299 size_t stack_size = sysconf(_SC_PAGESIZE);
1300 void *stack = alloca(stack_size);
1301
1302 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1303 if (ret < 0 || ret >= sizeof(fnam))
1304 _exit(1);
1305
1306 fd = open(fnam, O_RDONLY);
1307 if (fd < 0) {
1308 perror("write_task_init_pid_exit open of ns/pid");
1309 _exit(1);
1310 }
1311 if (setns(fd, 0)) {
1312 perror("write_task_init_pid_exit setns 1");
1313 close(fd);
1314 _exit(1);
1315 }
1316 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1317 if (pid < 0)
1318 _exit(1);
1319 if (pid != 0) {
1320 if (!wait_for_pid(pid))
1321 _exit(1);
1322 _exit(0);
1323 }
1324 }
1325
1326 static int send_creds_clone_wrapper(void *arg) {
1327 struct ucred cred;
1328 char v;
1329 int sock = *(int *)arg;
1330
1331 /* we are the child */
1332 cred.uid = 0;
1333 cred.gid = 0;
1334 cred.pid = 1;
1335 v = '1';
1336 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1337 return 1;
1338 return 0;
1339 }
1340
1341 static pid_t get_init_pid_for_task(pid_t task)
1342 {
1343 int sock[2];
1344 pid_t pid;
1345 pid_t ret = -1;
1346 char v = '0';
1347 struct ucred cred;
1348
1349 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1350 perror("socketpair");
1351 return -1;
1352 }
1353
1354 pid = fork();
1355 if (pid < 0)
1356 goto out;
1357 if (!pid) {
1358 close(sock[1]);
1359 write_task_init_pid_exit(sock[0], task);
1360 _exit(0);
1361 }
1362
1363 if (!recv_creds(sock[1], &cred, &v))
1364 goto out;
1365 ret = cred.pid;
1366
1367 out:
1368 close(sock[0]);
1369 close(sock[1]);
1370 if (pid > 0)
1371 wait_for_pid(pid);
1372 return ret;
1373 }
1374
/*
 * Look up (and cache) the pid of the init process of @qpid's pid
 * namespace. The pidns store is consulted first, keyed by the stat
 * identity of /proc/<qpid>/ns/pid; on a miss the answer comes from
 * get_init_pid_for_task() and is cached. Returns 0 on failure.
 * Takes and releases the store lock around all store accesses.
 */
static pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		/* cache hit */
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}
1402
/*
 * Reap @pid, retrying on EINTR. Returns 0 when the child exited with
 * status 0, -1 on any error or non-zero/abnormal exit.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		/* reaped something else; keep waiting for @pid */
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1423
1424
1425 /*
1426 * append pid to *src.
1427 * src: a pointer to a char* in which ot append the pid.
1428 * sz: the number of characters printed so far, minus trailing \0.
1429 * asz: the allocated size so far
1430 * pid: the pid to append
1431 */
1432 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1433 {
1434 char tmp[30];
1435
1436 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1437
1438 if (!*src || tmplen + *sz + 1 >= *asz) {
1439 char *tmp;
1440 do {
1441 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1442 } while (!tmp);
1443 *src = tmp;
1444 *asz += BUF_RESERVE_SIZE;
1445 }
1446 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1447 *sz += tmplen;
1448 }
1449
/*
 * Given a open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 (i.e. UINT_MAX) on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	/* was fgets(line, 400, ...): keep the bound tied to the buffer */
	while (fgets(line, sizeof(line), idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid+count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less that nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
1493
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* buffer size used for /proc/<pid>/... path names below */
#define PROCLEN 100
1503
/*
 * Decide whether the caller (@pid running as host-@uid) is privileged
 * over an object owned by host-@victim. With NS_ROOT_OPT a matching uid
 * suffices; otherwise the caller must map to root (uid 0) in his own
 * user namespace and @victim must be mapped into that namespace too.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject */
	/* (non-zero mapped uid, including the -1 error value, rejects) */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}
1549
/* Return true when the "other" permission bits in @fmode grant the
 * access requested by open(2)-style @req_mode. */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;

	/* map the access mode onto the required S_I?OTH bits */
	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		needed = S_IROTH;
		break;
	case O_WRONLY:
		needed = S_IWOTH;
		break;
	case O_RDWR:
		needed = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}

	return (fmode & needed) == needed;
}
1569
1570
/*
 * Return the first path component of @taskcg below @querycg, newly
 * allocated (caller frees). E.g. querycg "/a/b/c", taskcg "/a/b/c/d/e"
 * yields "d". @taskcg must be strictly longer than @querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		next = strdup(taskcg + 1);
	else
		next = strdup(taskcg + strlen(querycg) + 1);
	if (!next)
		return NULL;

	/* keep only the first path component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
1596
/* Chop a single trailing newline off @x, in place. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1603
/*
 * Parse /proc/<pid>/cgroup and return a strdup'd copy of the cgroup
 * path @pid belongs to for controller @contrl (the text after the
 * second ':' of the matching line, newline stripped). Returns NULL
 * when the controller isn't mounted or no line matches. Caller frees.
 */
static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	/* each line looks like "<id>:<controller list>:<cgroup path>" */
	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		/* only the line whose controller field matches @contrl */
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		/* retry until the strdup succeeds */
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}
1650
1651 /*
1652 * check whether a fuse context may access a cgroup dir or file
1653 *
1654 * If file is not null, it is a cgroup file to check under cg.
1655 * If file is null, then we are checking perms on cg itself.
1656 *
1657 * For files we can check the mode of the list_keys result.
1658 * For cgroups, we must make assumptions based on the files under the
1659 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1660 * yet.
1661 */
1662 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1663 {
1664 struct cgfs_files *k = NULL;
1665 bool ret = false;
1666
1667 k = cgfs_get_key(contrl, cg, file);
1668 if (!k)
1669 return false;
1670
1671 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1672 if (perms_include(k->mode >> 6, mode)) {
1673 ret = true;
1674 goto out;
1675 }
1676 }
1677 if (fc->gid == k->gid) {
1678 if (perms_include(k->mode >> 3, mode)) {
1679 ret = true;
1680 goto out;
1681 }
1682 }
1683 ret = perms_include(k->mode, mode);
1684
1685 out:
1686 free_key(k);
1687 return ret;
1688 }
1689
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" component from @cg in place, keeping a
 * lone "/" when that is all that remains. */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0'; /* whole string was "/init.scope": keep "/" */
	else
		suffix[0] = '\0';
}
1707
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	/* ignore a trailing /init.scope component of the caller's cgroup */
	prune_init_slice(c2);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	/* the caller's cgroup must be a prefix of @cg */
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}
1750
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 * In other words: @cg is visible when it is an ancestor, the cgroup
 * itself, or a descendant of the caller's own cgroup.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* skip the leading '/' of the caller's cgroup path */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
1801
/*
 * given /cgroup/freezer/a/b, return "freezer".
 * the returned char* should NOT be freed.
 * Paths are expected to start with "/cgroup/": the magic offsets below
 * are strlen("/cgroup") == 7 and strlen("/cgroup/") == 8, and 9 is the
 * minimum length of "/cgroup/x". Sets errno and returns NULL on error.
 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	/* the controller name starts right after "/cgroup/" */
	p1 = path + 8;
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	/* cut off anything after the controller component */
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	/* return the matching entry of the global hierarchies table */
	int i;
	for (i = 0;  i < num_hierarchies;  i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}
1837
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	/* needs at least "/cgroup/x" */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* skip "/cgroup/" and look for the slash after the controller */
	sep = strstr(path + 8, "/");
	if (!sep) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return sep + 1;
}
1858
1859 /*
1860 * split the last path element from the path in @cg.
1861 * @dir is newly allocated and should be freed, @last not
1862 */
1863 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1864 {
1865 char *p;
1866
1867 do {
1868 *dir = strdup(cg);
1869 } while (!*dir);
1870 *last = strrchr(cg, '/');
1871 if (!*last) {
1872 *last = NULL;
1873 return;
1874 }
1875 p = strrchr(*dir, '/');
1876 *p = '\0';
1877 }
1878
1879 /*
1880 * FUSE ops for /cgroup
1881 */
1882
/*
 * FUSE getattr for /cgroup paths: fill @sb for the top-level dir, a
 * controller dir, a child cgroup dir, or a cgroup file (key), applying
 * the caller's visibility and permission checks along the way.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* synthesize timestamps: everything appears modified "now" */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* split into parent dir (path1) and final component (path2) */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* resolve the caller to its pidns init so the checks apply to the
	 * container as a whole */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* not a child cgroup: try it as a key (regular file) */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
1992
/*
 * FUSE opendir for /cgroup paths: permission-check the directory and
 * stash a file_info in fi->fh for cg_readdir()/cg_releasedir().
 */
int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		/* top level: neither controller nor cgroup selected yet */
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		/* visibility then permission, both relative to the caller's
		 * own cgroup */
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;
	/* NOTE(review): dir_info->size and ->cached are left
	 * uninitialized here; presumably unused for LXC_TYPE_CGDIR —
	 * confirm. */

	fi->fh = (unsigned long)dir_info;
	return 0;
}
2043
/*
 * FUSE readdir for /cgroup paths: at the top level list the mounted
 * controllers; inside a controller list the cgroup's keys and child
 * cgroups, restricted to what the caller may see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0;  i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* caller is above this cgroup: show only the next path
		 * component on the way down to its own cgroup */
		if (nextcg) {
			ret = filler(buf, nextcg,  NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* the cgroup's keys (files) */
	for (i = 0; list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2127
2128 static void do_release_file_info(struct fuse_file_info *fi)
2129 {
2130 struct file_info *f = (struct file_info *)fi->fh;
2131
2132 if (!f)
2133 return;
2134
2135 fi->fh = 0;
2136
2137 free(f->controller);
2138 f->controller = NULL;
2139 free(f->cgroup);
2140 f->cgroup = NULL;
2141 free(f->file);
2142 f->file = NULL;
2143 free(f->buf);
2144 f->buf = NULL;
2145 free(f);
2146 f = NULL;
2147 }
2148
/* FUSE releasedir: free the file_info allocated in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2154
/*
 * FUSE open for a cgroup file (key): verify the key exists and that the
 * caller may see and access it, then stash a file_info in fi->fh for
 * the read/write/release handlers.
 */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* split into the cgroup dir (path1) and the key name (path2) */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* the key must exist */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;
	/* NOTE(review): file_info->size and ->cached are left
	 * uninitialized here — confirm the read/write paths set them
	 * before use. */

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}
2222
/*
 * FUSE access for /cgroup paths: emulate access(2) semantics for
 * controller dirs and cgroup keys, using the same visibility and
 * permission checks as the other handlers.
 */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* split into the cgroup dir (path1) and final component (path2) */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* nonexistent keys are read-only-accessible */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2286
/* FUSE release: free the file_info allocated in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2292
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait up to @timeout seconds for @sock to become readable (or hang
 * up). Returns true when the fd is ready, false on timeout or error
 * (with errno cleared on timeout).
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, saved_errno;
	/* was int: time(2) returns time_t, which may not fit in an int */
	time_t now, starttime, deltatime;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	saved_errno = errno; /* close() below may clobber errno */
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2340
/*
 * recv() with a 2 second timeout: returns -1 when nothing arrives in
 * time, otherwise the recv(2) result for up to @len bytes.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2347
2348 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2349 {
2350 struct msghdr msg = { 0 };
2351 struct iovec iov;
2352 struct cmsghdr *cmsg;
2353 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2354 char buf[1];
2355 buf[0] = 'p';
2356
2357 if (pingfirst) {
2358 if (msgrecv(sock, buf, 1) != 1) {
2359 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2360 return SEND_CREDS_FAIL;
2361 }
2362 }
2363
2364 msg.msg_control = cmsgbuf;
2365 msg.msg_controllen = sizeof(cmsgbuf);
2366
2367 cmsg = CMSG_FIRSTHDR(&msg);
2368 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2369 cmsg->cmsg_level = SOL_SOCKET;
2370 cmsg->cmsg_type = SCM_CREDENTIALS;
2371 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2372
2373 msg.msg_name = NULL;
2374 msg.msg_namelen = 0;
2375
2376 buf[0] = v;
2377 iov.iov_base = buf;
2378 iov.iov_len = sizeof(buf);
2379 msg.msg_iov = &iov;
2380 msg.msg_iovlen = 1;
2381
2382 if (sendmsg(sock, &msg, 0) < 0) {
2383 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2384 if (errno == 3)
2385 return SEND_CREDS_NOTSK;
2386 return SEND_CREDS_FAIL;
2387 }
2388
2389 return SEND_CREDS_OK;
2390 }
2391
/*
 * Receive a struct ucred and one status byte over @sock. Enables
 * SO_PASSCRED, writes a one-byte ping ('1') so the peer knows we are
 * listening, then waits up to 2s for the SCM_CREDENTIALS message. On
 * success *cred holds the received credentials and *v the peer's status
 * byte. Returns false on any socket error or timeout.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	/* defaults in case no credentials arrive */
	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* ping the sender: see the pingfirst path in send_creds() */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* only accept a well-formed SCM_CREDENTIALS control message */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2449
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe the child uses to ACK that it started */
	int sock;	/* socket passed on to the wrapped function */
	pid_t tpid;	/* target pid whose namespace is involved */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2456
2457 /*
2458 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2459 * with clone(). This simply writes '1' as ACK back to the parent
2460 * before calling the actual wrapped function.
2461 */
2462 static int pid_ns_clone_wrapper(void *arg) {
2463 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2464 char b = '1';
2465
2466 close(args->cpipe[0]);
2467 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2468 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2469 close(args->cpipe[1]);
2470 return args->wrapped(args->sock, args->tpid);
2471 }
2472
2473 /*
2474 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2475 * int value back over the socket. This shifts the pid from the
2476 * sender's pidns into tpid's pidns.
2477 */
2478 static int pid_to_ns(int sock, pid_t tpid)
2479 {
2480 char v = '0';
2481 struct ucred cred;
2482
2483 while (recv_creds(sock, &cred, &v)) {
2484 if (v == '1')
2485 return 0;
2486 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2487 return 1;
2488 }
2489 return 0;
2490 }
2491
2492
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child and only ever leaves via _exit().
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	/* join @tpid's pidns; only children cloned afterwards live there */
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* one page of stack for the cloned child; stack grows down, so
	 * pass the top of the region */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/* NOTE(review): wait_for_pid() returns 0 on success, so this
	 * exits 1 on success and 0 on failure — looks inverted; confirm
	 * whether any caller inspects this exit status before changing. */
	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2549
/*
 * do_read_pids - read the pid list in @contrl:@cg/@file and translate each
 * pid into @tpid's pid namespace, appending the results to *@d.
 *
 * A forked child joins @tpid's pidns (pid_to_ns_wrapper).  For every pid
 * parsed from the cgroup file we send it as SCM_CREDENTIALS over a
 * socketpair; the kernel rewrites cred.pid into the child's namespace, and
 * the child writes the translated value back.  Returns true on success.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	/* Parent: feed one pid per line to the child. */
	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		/* SEND_CREDS_NOTSK: pid no longer exists - just skip it. */
		if (ret == SEND_CREDS_NOTSK)
			goto next;
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child to exit: control byte '1' is the stop marker. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2637
/*
 * cg_read - FUSE read handler for cgroup files.
 *
 * Permission-checks the caller, then either translates pids (tasks /
 * cgroup.procs files) into the caller's pid namespace or returns the raw
 * cgroup value.  Returns bytes copied into @buf, or a negative errno.
 * Note: only offset-0 reads return data; non-zero offsets yield 0 (EOF).
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Existence check only - the key itself is not used. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Ensure the (possibly truncated) output ends with a newline when
	 * there is room for one. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2704
2705 static int pid_from_ns(int sock, pid_t tpid)
2706 {
2707 pid_t vpid;
2708 struct ucred cred;
2709 char v;
2710 int ret;
2711
2712 cred.uid = 0;
2713 cred.gid = 0;
2714 while (1) {
2715 if (!wait_for_sock(sock, 2)) {
2716 lxcfs_error("%s\n", "Timeout reading from parent.");
2717 return 1;
2718 }
2719 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2720 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2721 return 1;
2722 }
2723 if (vpid == -1) // done
2724 break;
2725 v = '0';
2726 cred.pid = vpid;
2727 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2728 v = '1';
2729 cred.pid = getpid();
2730 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2731 return 1;
2732 }
2733 }
2734 return 0;
2735 }
2736
/*
 * pid_from_ns_wrapper - mirror of pid_to_ns_wrapper: setns into @tpid's
 * pid namespace, then clone a child (which is the first process actually
 * inside that namespace) running pid_from_ns on @sock.  Runs in a forked
 * child; every failure path calls _exit().
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target task. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page of stack for the clone child; stack + stack_size assumes
	 * a downward-growing stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2782
/*
 * hostuid_to_ns - given host @uid, find the uid to which it maps in
 * @pid's user namespace by parsing /proc/<pid>/uid_map.
 *
 * Returns true and fills in *@answer on success; false if the map file
 * cannot be opened or the uid has no mapping there.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char path[400];
	int ret;

	/* snprintf instead of sprintf: bound the write and detect truncation. */
	ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* convert_id_to_ns() yields (uid_t)-1 when there is no mapping. */
	if (*answer == (uid_t)-1)
		return false;
	return true;
}
2804
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/<pid>/status ("Uid:"/"Gid:" lines report the real id first).
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left as (uid_t)-1 / (gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	/* snprintf instead of sprintf: bound the write and detect truncation. */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* The buffer is reused for reading lines once the path is opened. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2843
/*
 * may_move_pid - may requestor @r move victim @v into a new cgroup?
 * Allowed when:
 *  . they are the same task,
 *  . they are owned by the same uid,
 *  . @r is root on the host, or
 *  . @r is root in its own userns and @v's uid is mapped into it.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v)
		return true;

	if (r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (r_uid == victim_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(victim_uid, r, &mapped);
}
2869
/*
 * do_write_pids - write the pids listed (one per line) in @buf to the
 * tasks/procs file of @contrl:@cg on behalf of caller @tpid/@tuid.
 *
 * The pids in @buf are valid in @tpid's pid namespace.  A forked child
 * joins that namespace (pid_from_ns_wrapper); we send it each raw pid and
 * it replies with SCM_CREDENTIALS, which the kernel translates into our
 * namespace.  Each translated pid is permission-checked via may_move_pid()
 * before being written.  Returns true if every pid was accepted.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* v == '0' means the child translated the pid; '1'
			 * means the pid no longer exists and is skipped. */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				// NOTE(review): pids_file is stdio-buffered, so consecutive
				// fprintf()s of pids may coalesce into a single write(2) to
				// the cgroup file - confirm each pid reaches the kernel as
				// its own write.
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child helper (pid -1 marker) to exit. */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	/* fclose() flushes the buffered pid writes - its result matters. */
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2948
2949 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2950 struct fuse_file_info *fi)
2951 {
2952 struct fuse_context *fc = fuse_get_context();
2953 char *localbuf = NULL;
2954 struct cgfs_files *k = NULL;
2955 struct file_info *f = (struct file_info *)fi->fh;
2956 bool r;
2957
2958 if (f->type != LXC_TYPE_CGFILE) {
2959 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2960 return -EIO;
2961 }
2962
2963 if (offset)
2964 return 0;
2965
2966 if (!fc)
2967 return -EIO;
2968
2969 localbuf = alloca(size+1);
2970 localbuf[size] = '\0';
2971 memcpy(localbuf, buf, size);
2972
2973 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2974 size = -EINVAL;
2975 goto out;
2976 }
2977
2978 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2979 size = -EACCES;
2980 goto out;
2981 }
2982
2983 if (strcmp(f->file, "tasks") == 0 ||
2984 strcmp(f->file, "/tasks") == 0 ||
2985 strcmp(f->file, "/cgroup.procs") == 0 ||
2986 strcmp(f->file, "cgroup.procs") == 0)
2987 // special case - we have to translate the pids
2988 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2989 else
2990 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2991
2992 if (!r)
2993 size = -EINVAL;
2994
2995 out:
2996 free_key(k);
2997 return size;
2998 }
2999
3000 int cg_chown(const char *path, uid_t uid, gid_t gid)
3001 {
3002 struct fuse_context *fc = fuse_get_context();
3003 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3004 struct cgfs_files *k = NULL;
3005 const char *cgroup;
3006 int ret;
3007
3008 if (!fc)
3009 return -EIO;
3010
3011 if (strcmp(path, "/cgroup") == 0)
3012 return -EPERM;
3013
3014 controller = pick_controller_from_path(fc, path);
3015 if (!controller)
3016 return errno == ENOENT ? -EPERM : -errno;
3017
3018 cgroup = find_cgroup_in_path(path);
3019 if (!cgroup)
3020 /* this is just /cgroup/controller */
3021 return -EPERM;
3022
3023 get_cgdir_and_path(cgroup, &cgdir, &last);
3024
3025 if (!last) {
3026 path1 = "/";
3027 path2 = cgdir;
3028 } else {
3029 path1 = cgdir;
3030 path2 = last;
3031 }
3032
3033 if (is_child_cgroup(controller, path1, path2)) {
3034 // get uid, gid, from '/tasks' file and make up a mode
3035 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3036 k = cgfs_get_key(controller, cgroup, "tasks");
3037
3038 } else
3039 k = cgfs_get_key(controller, path1, path2);
3040
3041 if (!k) {
3042 ret = -EINVAL;
3043 goto out;
3044 }
3045
3046 /*
3047 * This being a fuse request, the uid and gid must be valid
3048 * in the caller's namespace. So we can just check to make
3049 * sure that the caller is root in his uid, and privileged
3050 * over the file's current owner.
3051 */
3052 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3053 ret = -EACCES;
3054 goto out;
3055 }
3056
3057 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3058
3059 out:
3060 free_key(k);
3061 free(cgdir);
3062
3063 return ret;
3064 }
3065
3066 int cg_chmod(const char *path, mode_t mode)
3067 {
3068 struct fuse_context *fc = fuse_get_context();
3069 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3070 struct cgfs_files *k = NULL;
3071 const char *cgroup;
3072 int ret;
3073
3074 if (!fc)
3075 return -EIO;
3076
3077 if (strcmp(path, "/cgroup") == 0)
3078 return -EPERM;
3079
3080 controller = pick_controller_from_path(fc, path);
3081 if (!controller)
3082 return errno == ENOENT ? -EPERM : -errno;
3083
3084 cgroup = find_cgroup_in_path(path);
3085 if (!cgroup)
3086 /* this is just /cgroup/controller */
3087 return -EPERM;
3088
3089 get_cgdir_and_path(cgroup, &cgdir, &last);
3090
3091 if (!last) {
3092 path1 = "/";
3093 path2 = cgdir;
3094 } else {
3095 path1 = cgdir;
3096 path2 = last;
3097 }
3098
3099 if (is_child_cgroup(controller, path1, path2)) {
3100 // get uid, gid, from '/tasks' file and make up a mode
3101 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3102 k = cgfs_get_key(controller, cgroup, "tasks");
3103
3104 } else
3105 k = cgfs_get_key(controller, path1, path2);
3106
3107 if (!k) {
3108 ret = -EINVAL;
3109 goto out;
3110 }
3111
3112 /*
3113 * This being a fuse request, the uid and gid must be valid
3114 * in the caller's namespace. So we can just check to make
3115 * sure that the caller is root in his uid, and privileged
3116 * over the file's current owner.
3117 */
3118 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3119 ret = -EPERM;
3120 goto out;
3121 }
3122
3123 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3124 ret = -EINVAL;
3125 goto out;
3126 }
3127
3128 ret = 0;
3129 out:
3130 free_key(k);
3131 free(cgdir);
3132 return ret;
3133 }
3134
/*
 * cg_mkdir - FUSE mkdir handler: create a child cgroup.
 *
 * The caller may only create cgroups underneath a cgroup it is itself
 * confined to (caller_is_in_ancestor) and for which it has write access.
 * Returns 0 on success or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory the new cgroup goes into. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Resolve the caller's init pid so checks run against the
	 * container's cgroup, not the requesting task's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* next (when set) is the first component below the caller's
		 * cgroup; if it equals the requested name, it already exists. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3188
/*
 * cg_rmdir - FUSE rmdir handler: remove a child cgroup.
 *
 * Only cgroups strictly below the caller's own cgroup may be removed.
 * Returns 0 on success or a negative errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Resolve the caller's init pid so checks run against the
	 * container's cgroup, not the requesting task's. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup (or an ancestor) is EBUSY;
		 * anything else outside its subtree simply doesn't exist from
		 * the caller's point of view. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3249
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3256
/*
 * parse_memstat - pull selected "total_*" counters out of the
 * memory.stat blob in @memstat.  Values are reported by the kernel in
 * bytes and converted here to kB.  Counters whose line is absent are
 * left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		size_t len;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         11, cached        },
		{ "total_active_anon",   17, active_anon   },
		{ "total_inactive_anon", 19, inactive_anon },
		{ "total_active_file",   17, active_file   },
		{ "total_inactive_file", 19, inactive_file },
		{ "total_unevictable",   17, unevictable   },
		{ "total_shmem",         11, shmem         },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (strncmp(memstat, fields[i].key, fields[i].len) != 0)
				continue;
			sscanf(memstat + fields[i].len, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3293
/*
 * get_blkio_io_value - scan a blkio stat blob for the line matching
 * "major:minor iotype" and return its value through @v.  *@v is zeroed
 * first, so a missing entry reads as 0.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t len;
	char *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	len = strlen(key);

	*v = 0;
	while (*str) {
		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
3316
3317 static int read_file(const char *path, char *buf, size_t size,
3318 struct file_info *d)
3319 {
3320 size_t linelen = 0, total_len = 0, rv = 0;
3321 char *line = NULL;
3322 char *cache = d->buf;
3323 size_t cache_size = d->buflen;
3324 FILE *f = fopen(path, "r");
3325 if (!f)
3326 return 0;
3327
3328 while (getline(&line, &linelen, f) != -1) {
3329 ssize_t l = snprintf(cache, cache_size, "%s", line);
3330 if (l < 0) {
3331 perror("Error writing to cache");
3332 rv = 0;
3333 goto err;
3334 }
3335 if (l >= cache_size) {
3336 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3337 rv = 0;
3338 goto err;
3339 }
3340 cache += l;
3341 cache_size -= l;
3342 total_len += l;
3343 }
3344
3345 d->size = total_len;
3346 if (total_len > size)
3347 total_len = size;
3348
3349 /* read from off 0 */
3350 memcpy(buf, d->buf, total_len);
3351 rv = total_len;
3352 err:
3353 fclose(f);
3354 free(line);
3355 return rv;
3356 }
3357
3358 /*
3359 * FUSE ops for /proc
3360 */
3361
/*
 * get_memlimit - read a memory limit file (@file) for @cgroup.
 * Returns the parsed value, or (unsigned long)-1 when the file cannot
 * be read ("no limit" sentinel).
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *str = NULL;
	unsigned long limit = -1;

	if (cgfs_get_value("memory", cgroup, file, &str))
		limit = strtoul(str, NULL, 10);

	free(str);

	return limit;
}
3374
/*
 * get_min_memlimit - return the tightest (smallest) limit found in @file
 * for @cgroup or any of its ancestors up to the root.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	/* Walk towards the root: glibc dirname() trims one path component
	 * per call, modifying the stack copy in place.
	 * NOTE(review): assumes @cgroup is an absolute path - dirname of a
	 * relative path yields "." and this loop would never terminate;
	 * confirm callers always pass a leading '/'. */
	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		/* (unsigned long)-1 means no limit set at this level. */
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	};

	return retlimit;
}
3391
/*
 * proc_meminfo_read - synthesize /proc/meminfo for the caller's container.
 *
 * Reads the caller's memory cgroup limits/usage and rewrites each line of
 * the host's /proc/meminfo so totals reflect the cgroup rather than the
 * host.  The rendered text is cached in @d for subsequent offset reads.
 * Returns bytes copied into @buf, 0 on error, or -EINVAL for a bad offset.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offsets are served straight from the cache built by a
	 * previous offset-0 read. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the caller's init pid so we use the container's cgroup. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		/* convert bytes to kB */
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
		&inactive_anon, &active_file, &inactive_file,
		&unevictable, &shmem);

	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	/* Rewrite the host's meminfo line by line; lines we don't emulate
	 * are passed through unchanged, host-only counters are zeroed. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more memory than the host has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
			/* memsw counts mem+swap, so swap usage is the excess
			 * over plain memory usage. */
			unsigned long swaptotal = memswlimit,
				swapusage = memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
					active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
					inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3576
/*
 * get_cpuset - fetch cpuset.cpus for @cg.
 * Returns a newly allocated string the caller must free, or NULL on
 * failure.
 */
static char *get_cpuset(const char *cg)
{
	char *value = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &value))
		return NULL;

	return value;
}
3589
3590 bool cpu_in_cpuset(int cpu, const char *cpuset);
3591
/*
 * cpuline_in_cpuset - does this /proc/cpuinfo "processor : N" line refer
 * to a cpu contained in @cpuset?  Non-processor lines return false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3600
/*
 * Read a cgroup CPU CFS parameter: `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, selected by @param being "quota" or "period".
 * The parsed value is returned through @value.
 *
 * Returns true on success, false if the file cannot be read or parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;

	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64, not "%ld": int64_t is `long long` on 32-bit platforms,
	 * where a plain %ld would be a format-string mismatch (UB). */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3626
/*
 * max_cpu_count - maximum number of visible CPUs implied by the cgroup's
 * CFS quota.  Returns 0 when no quota is set (or it cannot be read),
 * which callers treat as "unlimited".
 */
int max_cpu_count(const char *cg)
{
	int count, nprocs;
	int64_t quota, period;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	/* Round quota/period up: a fractional remainder still entitles the
	 * cgroup to part of one more CPU. */
	count = quota / period;
	if (quota % period)
		count++;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3660
/*
 * use_cpuview - CPU views are usable only when both the cpu and cpuacct
 * controllers are mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3679
/*
 * is_processor_line - check whether @line is a "processor : N" entry
 * from /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3691
/* FUSE read handler for the virtualized /proc/cpuinfo.
 *
 * Renders a per-container cpuinfo: only CPUs present in the caller's
 * cpuset are emitted, renumbered from 0; when CPU views are enabled the
 * listing is additionally capped at the quota-derived CPU count. The
 * rendered text is cached in d->buf so follow-up reads (offset != 0)
 * are served from the cache. Returns the number of bytes copied into
 * @buf, 0 on error.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Continuation read: serve the previously rendered cache. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the caller's namespace init pid, then its cpuset cgroup.
	 * Without a cgroup, pass the host file through unchanged.
	 * NOTE(review): path here is "proc/cpuinfo" without a leading '/',
	 * unlike the fopen below — confirm read_file() expects that.
	 */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	/* Only cap the CPU count when CPU views are in effect. */
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x cpuinfo has a different layout: a shared header
		 * followed by "processor N:" lines, detected via the
		 * vendor string on the first line.
		 */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's processor-count line; it is re-emitted
		 * below for s390x with the container's count.
		 * NOTE(review): only the first 12 bytes are compared, so
		 * the trailing ':' of the literal is not checked.
		 */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU cap is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* A processor stanza is printed only if its CPU is
			 * in the cpuset; am_printing also gates the stanza's
			 * following detail lines.
			 */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x per-CPU line: renumber and keep the rest of
			 * the line (everything after the ':') verbatim.
			 */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Detail lines belonging to a printed processor stanza. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: prepend the vendor line and the container's processor
	 * count by re-rendering into a fresh buffer, then appending the
	 * text collected above.
	 * NOTE(review): the do/while retries malloc forever on OOM, and
	 * the s390x-specific error paths below can leak the new d->buf —
	 * verify against upstream intent.
	 */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	/* Publish the render and satisfy this (offset 0) read from it. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3867
/* Return the start time (field 22 of /proc/<pid>/stat, in clock ticks
 * since boot) of the init process reaping @pid's namespace.
 * On any failure returns 0 with errno set to EINVAL; on success errno
 * is cleared so callers can distinguish a genuine 0.
 */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument supression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 *
	 * NOTE(review): "%*s" for the comm field stops at whitespace; a
	 * process name containing a space (e.g. "(my prog)") would throw
	 * the remaining fields off — confirm whether that matters here.
	 */
	ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
			"%*s "      /* (2)  comm        %s   */
			"%*c "      /* (3)  state       %c   */
			"%*d "      /* (4)  ppid        %d   */
			"%*d "      /* (5)  pgrp        %d   */
			"%*d "      /* (6)  session     %d   */
			"%*d "      /* (7)  tty_nr      %d   */
			"%*d "      /* (8)  tpgid       %d   */
			"%*u "      /* (9)  flags       %u   */
			"%*u "      /* (10) minflt      %lu  */
			"%*u "      /* (11) cminflt     %lu  */
			"%*u "      /* (12) majflt      %lu  */
			"%*u "      /* (13) cmajflt     %lu  */
			"%*u "      /* (14) utime       %lu  */
			"%*u "      /* (15) stime       %lu  */
			"%*d "      /* (16) cutime      %ld  */
			"%*d "      /* (17) cstime      %ld  */
			"%*d "      /* (18) priority    %ld  */
			"%*d "      /* (19) nice        %ld  */
			"%*d "      /* (20) num_threads %ld  */
			"%*d "      /* (21) itrealvalue %ld  */
			"%" PRIu64, /* (22) starttime   %llu */
		     &starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	/* Success: clear errno so 0 can be told apart from failure. */
	errno = 0;
	return starttime;
}
3946
3947 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3948 {
3949 uint64_t clockticks;
3950 int64_t ticks_per_sec;
3951
3952 clockticks = get_reaper_start_time(pid);
3953 if (clockticks == 0 && errno == EINVAL) {
3954 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3955 return 0;
3956 }
3957
3958 ticks_per_sec = sysconf(_SC_CLK_TCK);
3959 if (ticks_per_sec < 0 && errno == EINVAL) {
3960 lxcfs_debug(
3961 "%s\n",
3962 "failed to determine number of clock ticks in a second");
3963 return 0;
3964 }
3965
3966 return (clockticks /= ticks_per_sec);
3967 }
3968
3969 static uint64_t get_reaper_age(pid_t pid)
3970 {
3971 uint64_t procstart, uptime, procage;
3972
3973 /* We need to substract the time the process has started since system
3974 * boot minus the time when the system has started to get the actual
3975 * reaper age.
3976 */
3977 procstart = get_reaper_start_time_in_sec(pid);
3978 procage = procstart;
3979 if (procstart > 0) {
3980 int ret;
3981 struct timespec spec;
3982
3983 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3984 if (ret < 0)
3985 return 0;
3986 /* We could make this more precise here by using the tv_nsec
3987 * field in the timespec struct and convert it to milliseconds
3988 * and then create a double for the seconds and milliseconds but
3989 * that seems more work than it is worth.
3990 */
3991 uptime = spec.tv_sec;
3992 procage = uptime - procstart;
3993 }
3994
3995 return procage;
3996 }
3997
3998 /*
3999 * Returns 0 on success.
4000 * It is the caller's responsibility to free `return_usage`, unless this
4001 * function returns an error.
4002 */
/* Parse cpuacct.usage_all for cgroup @cg into a compact array holding
 * only the CPUs present in @cpuset, with times converted from
 * nanoseconds to USER_HZ ticks. Returns 0 on success and hands the
 * allocated array to the caller via @return_usage; on error returns a
 * negative value and sets *return_usage to NULL.
 */
static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
{
	int cpucount = get_nprocs();
	struct cpuacct_usage *cpu_usage;
	int rv = 0, i, j, ret, read_pos = 0, read_cnt;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;
	char *usage_str = NULL;

	ticks_per_sec = sysconf(_SC_CLK_TCK);

	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug(
		    "%s\n",
		    "read_cpuacct_usage_all failed to determine number of clock ticks "
		    "in a second");
		return -1;
	}

	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		rv = -1;
		goto err;
	}

	/* Skip the "cpu user system" header, recording its length in
	 * read_cnt via %n.
	 * NOTE(review): with no conversions, sscanf returns 0 both on a
	 * match and on a mismatch of a non-empty string, so only an empty
	 * usage_str (EOF) actually trips this check — verify intent.
	 */
	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
		lxcfs_error("read_cpuacct_usage_all reading first line from "
				"%s/cpuacct.usage_all failed.\n", cg);
		rv = -1;
		goto err;
	}

	read_pos += read_cnt;

	/* One "cpu user system" line per host CPU; j indexes the compact
	 * output array (cpuset members only).
	 */
	for (i = 0, j = 0; i < cpucount; i++) {
		/* NOTE(review): %lu scanned into uint64_t variables is a
		 * format mismatch on 32-bit platforms (should be SCNu64);
		 * also the loop index i, not the parsed cg_cpu, is checked
		 * against the cpuset — assumes rows are in CPU order.
		 */
		ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
				&cg_system, &read_cnt);

		if (ret == EOF)
			break;

		if (ret != 3) {
			lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
					"failed.\n", cg);
			rv = -1;
			goto err;
		}

		read_pos += read_cnt;

		if (!cpu_in_cpuset(i, cpuset))
			continue;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	rv = 0;
	*return_usage = cpu_usage;

err:
	if (usage_str)
		free(usage_str);

	/* On failure, ownership of cpu_usage stays here. */
	if (rv != 0) {
		free(cpu_usage);
		*return_usage = NULL;
	}

	return rv;
}
4080
4081 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4082 {
4083 int i;
4084 unsigned long sum = 0;
4085
4086 for (i = 0; i < cpu_count; i++) {
4087 /* When cpuset is changed on the fly, the CPUs might get reordered.
4088 * We could either reset all counters, or check that the substractions
4089 * below will return expected results.
4090 */
4091 if (newer[i].user > older[i].user)
4092 diff[i].user = newer[i].user - older[i].user;
4093 else
4094 diff[i].user = 0;
4095
4096 if (newer[i].system > older[i].system)
4097 diff[i].system = newer[i].system - older[i].system;
4098 else
4099 diff[i].system = 0;
4100
4101 if (newer[i].idle > older[i].idle)
4102 diff[i].idle = newer[i].idle - older[i].idle;
4103 else
4104 diff[i].idle = 0;
4105
4106 sum += diff[i].user;
4107 sum += diff[i].system;
4108 sum += diff[i].idle;
4109 }
4110
4111 return sum;
4112 }
4113
4114 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4115 {
4116 unsigned long free_space, to_add;
4117
4118 free_space = threshold - usage->user - usage->system;
4119
4120 if (free_space > usage->idle)
4121 free_space = usage->idle;
4122
4123 to_add = free_space > *surplus ? *surplus : free_space;
4124
4125 *counter += to_add;
4126 usage->idle -= to_add;
4127 *surplus -= to_add;
4128 }
4129
4130 static struct cg_proc_stat *find_proc_stat_node(const char *cg)
4131 {
4132 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4133 struct cg_proc_stat_head *head = proc_stat_history[hash];
4134 struct cg_proc_stat *node;
4135
4136 if (!head->next)
4137 return NULL;
4138
4139 node = head->next;
4140
4141 do {
4142 if (strcmp(cg, node->cg) == 0)
4143 return node;
4144 } while ((node = node->next));
4145
4146 return NULL;
4147 }
4148
4149 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4150 {
4151 struct cg_proc_stat *node;
4152 int i;
4153
4154 node = malloc(sizeof(struct cg_proc_stat));
4155 if (!node)
4156 goto err;
4157
4158 node->cg = NULL;
4159 node->usage = NULL;
4160 node->view = NULL;
4161
4162 node->cg = malloc(strlen(cg) + 1);
4163 if (!node->cg)
4164 goto err;
4165
4166 strcpy(node->cg, cg);
4167
4168 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4169 if (!node->usage)
4170 goto err;
4171
4172 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4173
4174 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4175 if (!node->view)
4176 goto err;
4177
4178 node->cpu_count = cpu_count;
4179 node->next = NULL;
4180
4181 for (i = 0; i < cpu_count; i++) {
4182 node->view[i].user = 0;
4183 node->view[i].system = 0;
4184 node->view[i].idle = 0;
4185 }
4186
4187 return node;
4188
4189 err:
4190 if (node && node->cg)
4191 free(node->cg);
4192 if (node && node->usage)
4193 free(node->usage);
4194 if (node && node->view)
4195 free(node->view);
4196 if (node)
4197 free(node);
4198
4199 return NULL;
4200 }
4201
4202 static void add_proc_stat_node(struct cg_proc_stat *new_node)
4203 {
4204 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4205 struct cg_proc_stat_head *head = proc_stat_history[hash];
4206 struct cg_proc_stat *node;
4207
4208 if (!head->next) {
4209 head->next = new_node;
4210 return;
4211 }
4212
4213 for (;;) {
4214 node = head->next;
4215
4216 if (node->next) {
4217 node = node->next;
4218 continue;
4219 }
4220
4221 node->next = new_node;
4222 return;
4223 }
4224 }
4225
4226 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4227 {
4228 int i;
4229
4230 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4231 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4232
4233 for (i = 0; i < cpu_count; i++) {
4234 node->view[i].user = 0;
4235 node->view[i].system = 0;
4236 node->view[i].idle = 0;
4237 }
4238
4239 node->cpu_count = cpu_count;
4240 }
4241
/* Render a quota-aware /proc/stat into @buf.
 *
 * Reads the host's per-CPU lines from @f (the "cpu " summary line is
 * assumed already consumed by the caller), derives per-CPU idle time by
 * comparing host usage with the cgroup's cpuacct counters, accumulates
 * deltas into the per-cgroup history node, redistributes time from
 * CPUs beyond the quota cap onto the visible ones, and finally writes
 * an aggregate line, one line per visible CPU, and the remainder of the
 * host file. Returns the number of bytes written, 0 on error.
 *
 * NOTE(review): rv and l are declared size_t, so every "l < 0" /
 * negative-snprintf check below can never fire (the int return is
 * converted to unsigned) — they are effectively dead code.
 */
static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size)
{
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	struct cpuacct_usage *diff = NULL;
	int nprocs = get_nprocs();

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int cpu, ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N */
			break;
		}

		if (sscanf(cpu_char, "%d", &cpu) != 1)
			continue;
		if (!cpu_in_cpuset(cpu, cpuset))
			continue;
		/* curcpu indexes cg_cpu_usage, which read_cpuacct_usage_all
		 * compacted to cpuset members only, in the same order.
		 */
		curcpu ++;
		cpu_cnt ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10)
			continue;

		/* The container's idle time on this CPU is the host idle
		 * time plus whatever host busy time the container did not
		 * itself consume.
		 */
		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
					"%lu in cpuacct.usage_all; unable to determine idle time\n",
					curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	stat_node = find_proc_stat_node(cg);

	if (!stat_node) {
		/* First sighting of this cgroup: create and register its
		 * history node, sized for all host CPUs.
		 */
		stat_node = new_proc_stat_node(cg_cpu_usage, nprocs, cg);
		if (!stat_node) {
			rv = 0;
			goto err;
		}

		add_proc_stat_node(stat_node);
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		rv = 0;
		goto err;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	if (cg_cpu_usage[0].user < stat_node->usage[0].user)
		reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, cpu_cnt);

	/* Fold this interval's deltas into the history; time spent on CPUs
	 * beyond the visible cap becomes surplus to redistribute.
	 */
	for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && curcpu >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		/* Spread the surplus onto visible CPUs, user time first,
		 * then system, without pushing any CPU past the threshold.
		 */
		for (curcpu = 0; curcpu < max_cpus; curcpu++) {
			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(
					&user_surplus,
					&diff[curcpu],
					&diff[curcpu].user,
					threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(
					&system_surplus,
					&diff[curcpu],
					&diff[curcpu].system,
					threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		/* Accumulate the adjusted deltas into the rendered view and
		 * total them for the aggregate "cpu" line.
		 */
		for (curcpu = 0; curcpu < max_cpus; curcpu++) {
			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}

	} else {
		/* No quota cap: the view mirrors the raw usage counters. */
		for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
			user_sum,
			system_sum,
			idle_sum);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
		if (max_cpus > 0 && curcpu == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				curcpu,
				stat_node->view[curcpu].user,
				stat_node->view[curcpu].system,
				stat_node->view[curcpu].idle);

		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	rv = total_len;

err:
	if (line)
		free(line);
	if (diff)
		free(diff);
	return rv;
}
4498
4499 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/* FUSE read handler for the virtualized /proc/stat.
 *
 * Two rendering paths:
 *  - with CPU views and cpuacct data, delegate to cpuview_proc_stat();
 *  - otherwise rewrite the host's per-CPU lines, filtering by the
 *    caller's cpuset, renumbering CPUs from 0 and, when cpuacct data is
 *    available, substituting the cgroup's own user/system/idle times.
 * The aggregate "cpu " line is rebuilt from the rendered CPUs; the
 * first CPUALL_MAX_SIZE bytes of d->buf are reserved for it and the
 * per-CPU text is written after that reserve, then shifted down.
 * Returns bytes copied into @buf, 0 on error.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;

	/* Continuation read: serve from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	/* Resolve the caller's cpuset cgroup; without one, pass the host
	 * file through unchanged.
	 */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
		lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* Quota-aware path: cpuview_proc_stat renders directly into
	 * d->buf (no CPUALL reserve needed there).
	 */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		int cpu;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &cpu) != 1)
			continue;
		if (!cpu_in_cpuset(cpu, cpuset))
			continue;
		/* curcpu renumbers visible CPUs from 0 and also indexes the
		 * compacted cg_cpu_usage array.
		 */
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* Without full field data or cpuacct counters, pass the host
		 * line through with only the CPU number rewritten.
		 */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			/* Container idle = host idle plus host busy time the
			 * container did not itself consume.
			 */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[curcpu].user;
			system_sum += cg_cpu_usage[curcpu].system;
			idle_sum += new_idle;

		} else {
			/* Host-data fallback: just total the host fields. */
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Rebuild the aggregate "cpu " line in the reserved prefix. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Shift the per-CPU text down so it follows the aggregate line;
	 * memmove because source and destination overlap.
	 */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4736
4737 /* This function retrieves the busy time of a group of tasks by looking at
4738 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4739 * been given it's own cpuacct cgroup. If not, this function will take the busy
4740 * time of all other taks that do not actually belong to the container into
4741 * account as well. If someone has a clever solution for this please send a
4742 * patch!
4743 */
4744 static unsigned long get_reaper_busy(pid_t task)
4745 {
4746 pid_t initpid = lookup_initpid_in_store(task);
4747 char *cgroup = NULL, *usage_str = NULL;
4748 unsigned long usage = 0;
4749
4750 if (initpid <= 0)
4751 return 0;
4752
4753 cgroup = get_pid_cgroup(initpid, "cpuacct");
4754 if (!cgroup)
4755 goto out;
4756 prune_init_slice(cgroup);
4757 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4758 goto out;
4759 usage = strtoul(usage_str, NULL, 10);
4760 usage /= 1000000000;
4761
4762 out:
4763 free(cgroup);
4764 free(usage_str);
4765 return usage;
4766 }
4767
#if RELOADTEST
/* Drop a marker file so the reload test can observe that this library
 * was (re)loaded; errors are deliberately ignored.
 */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd >= 0)
		close(fd);
}
#endif
4778
4779 /*
4780 * We read /proc/uptime and reuse its second field.
4781 * For the first field, we use the mtime for the reaper for
4782 * the calling pid as returned by getreaperage
4783 */
/* FUSE read handler for the virtualized /proc/uptime.
 * First field: the reaper's age (container "uptime"); second field:
 * that age minus the container's cpuacct busy time ("idle").
 * Returns bytes copied into @buf, 0 on error.
 */
static int proc_uptime_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long int busytime = get_reaper_busy(fc->pid);
	char *cache = d->buf;
	ssize_t total_len = 0;
	uint64_t idletime, reaperage;

#if RELOADTEST
	iwashere();
#endif

	/* Continuation read: serve from the cached render. */
	if (offset){
		if (!d->cached)
			return 0;
		if (offset > d->size)
			return -EINVAL;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	reaperage = get_reaper_age(fc->pid);
	/* To understand why this is done, please read the comment to the
	 * get_reaper_busy() function.
	 */
	idletime = reaperage;
	if (reaperage >= busytime)
		idletime = reaperage - busytime;

	/* Render with fixed ".00" fractions; sub-second precision is not
	 * tracked.
	 */
	total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
	if (total_len < 0 || total_len >= d->buflen){
		lxcfs_error("%s\n", "failed to write to cache");
		return 0;
	}

	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size) total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
4831
4832 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4833 struct fuse_file_info *fi)
4834 {
4835 char dev_name[72];
4836 struct fuse_context *fc = fuse_get_context();
4837 struct file_info *d = (struct file_info *)fi->fh;
4838 char *cg;
4839 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4840 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4841 unsigned long read = 0, write = 0;
4842 unsigned long read_merged = 0, write_merged = 0;
4843 unsigned long read_sectors = 0, write_sectors = 0;
4844 unsigned long read_ticks = 0, write_ticks = 0;
4845 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4846 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4847 char *cache = d->buf;
4848 size_t cache_size = d->buflen;
4849 char *line = NULL;
4850 size_t linelen = 0, total_len = 0, rv = 0;
4851 unsigned int major = 0, minor = 0;
4852 int i = 0;
4853 FILE *f = NULL;
4854
4855 if (offset){
4856 if (offset > d->size)
4857 return -EINVAL;
4858 if (!d->cached)
4859 return 0;
4860 int left = d->size - offset;
4861 total_len = left > size ? size: left;
4862 memcpy(buf, cache + offset, total_len);
4863 return total_len;
4864 }
4865
4866 pid_t initpid = lookup_initpid_in_store(fc->pid);
4867 if (initpid <= 0)
4868 initpid = fc->pid;
4869 cg = get_pid_cgroup(initpid, "blkio");
4870 if (!cg)
4871 return read_file("/proc/diskstats", buf, size, d);
4872 prune_init_slice(cg);
4873
4874 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4875 goto err;
4876 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4877 goto err;
4878 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4879 goto err;
4880 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4881 goto err;
4882 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4883 goto err;
4884
4885
4886 f = fopen("/proc/diskstats", "r");
4887 if (!f)
4888 goto err;
4889
4890 while (getline(&line, &linelen, f) != -1) {
4891 ssize_t l;
4892 char lbuf[256];
4893
4894 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4895 if (i != 3)
4896 continue;
4897
4898 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4899 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4900 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4901 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4902 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4903 read_sectors = read_sectors/512;
4904 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4905 write_sectors = write_sectors/512;
4906
4907 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4908 rd_svctm = rd_svctm/1000000;
4909 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4910 rd_wait = rd_wait/1000000;
4911 read_ticks = rd_svctm + rd_wait;
4912
4913 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4914 wr_svctm = wr_svctm/1000000;
4915 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4916 wr_wait = wr_wait/1000000;
4917 write_ticks = wr_svctm + wr_wait;
4918
4919 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4920 tot_ticks = tot_ticks/1000000;
4921
4922 memset(lbuf, 0, 256);
4923 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4924 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4925 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4926 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4927 else
4928 continue;
4929
4930 l = snprintf(cache, cache_size, "%s", lbuf);
4931 if (l < 0) {
4932 perror("Error writing to fuse buf");
4933 rv = 0;
4934 goto err;
4935 }
4936 if (l >= cache_size) {
4937 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4938 rv = 0;
4939 goto err;
4940 }
4941 cache += l;
4942 cache_size -= l;
4943 total_len += l;
4944 }
4945
4946 d->cached = 1;
4947 d->size = total_len;
4948 if (total_len > size ) total_len = size;
4949 memcpy(buf, d->buf, total_len);
4950
4951 rv = total_len;
4952 err:
4953 free(cg);
4954 if (f)
4955 fclose(f);
4956 free(line);
4957 free(io_serviced_str);
4958 free(io_merged_str);
4959 free(io_service_bytes_str);
4960 free(io_wait_time_str);
4961 free(io_service_time_str);
4962 return rv;
4963 }
4964
4965 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4966 struct fuse_file_info *fi)
4967 {
4968 struct fuse_context *fc = fuse_get_context();
4969 struct file_info *d = (struct file_info *)fi->fh;
4970 char *cg = NULL;
4971 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4972 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4973 ssize_t total_len = 0, rv = 0;
4974 ssize_t l = 0;
4975 char *cache = d->buf;
4976
4977 if (offset) {
4978 if (offset > d->size)
4979 return -EINVAL;
4980 if (!d->cached)
4981 return 0;
4982 int left = d->size - offset;
4983 total_len = left > size ? size: left;
4984 memcpy(buf, cache + offset, total_len);
4985 return total_len;
4986 }
4987
4988 pid_t initpid = lookup_initpid_in_store(fc->pid);
4989 if (initpid <= 0)
4990 initpid = fc->pid;
4991 cg = get_pid_cgroup(initpid, "memory");
4992 if (!cg)
4993 return read_file("/proc/swaps", buf, size, d);
4994 prune_init_slice(cg);
4995
4996 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4997
4998 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4999 goto err;
5000
5001 memusage = strtoul(memusage_str, NULL, 10);
5002
5003 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5004 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5005
5006 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5007 memswusage = strtoul(memswusage_str, NULL, 10);
5008
5009 swap_total = (memswlimit - memlimit) / 1024;
5010 swap_free = (memswusage - memusage) / 1024;
5011 }
5012
5013 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5014
5015 /* When no mem + swap limit is specified or swapaccount=0*/
5016 if (!memswlimit) {
5017 char *line = NULL;
5018 size_t linelen = 0;
5019 FILE *f = fopen("/proc/meminfo", "r");
5020
5021 if (!f)
5022 goto err;
5023
5024 while (getline(&line, &linelen, f) != -1) {
5025 if (startswith(line, "SwapTotal:")) {
5026 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5027 } else if (startswith(line, "SwapFree:")) {
5028 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5029 }
5030 }
5031
5032 free(line);
5033 fclose(f);
5034 }
5035
5036 if (swap_total > 0) {
5037 l = snprintf(d->buf + total_len, d->size - total_len,
5038 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5039 swap_total, swap_free);
5040 total_len += l;
5041 }
5042
5043 if (total_len < 0 || l < 0) {
5044 perror("Error writing to cache");
5045 rv = 0;
5046 goto err;
5047 }
5048
5049 d->cached = 1;
5050 d->size = (int)total_len;
5051
5052 if (total_len > size) total_len = size;
5053 memcpy(buf, d->buf, total_len);
5054 rv = total_len;
5055
5056 err:
5057 free(cg);
5058 free(memswlimit_str);
5059 free(memlimit_str);
5060 free(memusage_str);
5061 free(memswusage_str);
5062 return rv;
5063 }
/*
 * Collect the process pids listed under a cgroup path.
 * e.g. read /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the pids.
 * @pid_buf : array that receives one newly-allocated string per pid found.
 * @dpath : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : how many directory levels below @dpath to recurse into.
 * @sum : the number of pids collected so far; the new total is returned.
 * @cfd : file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + \0 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() releases it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* Skip exactly "." and "..". The old strncmp(..., 1) test
		 * also hid every other entry beginning with a dot and made
		 * the ".." comparison unreachable. */
		if (strcmp(file->d_name, ".") == 0)
			continue;
		if (strcmp(file->d_name, "..") == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name + \0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			/* Recurse into the child cgroup first. */
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Now read this cgroup's own pid list. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Each line (including its '\n') becomes one entry in *pid_buf. */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	free(line);
	free(path);
	return sum;
}
5150 /*
5151 * calc_load calculates the load according to the following formula:
5152 * load1 = load0 * exp + active * (1 - exp)
5153 *
5154 * @load1: the new loadavg.
5155 * @load0: the former loadavg.
5156 * @active: the total number of running pid at this moment.
5157 * @exp: the fixed-point defined in the beginning.
5158 */
5159 static unsigned long
5160 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5161 {
5162 unsigned long newload;
5163
5164 active = active > 0 ? active * FIXED_1 : 0;
5165 newload = load * exp + active * (FIXED_1 - exp);
5166 if (active >= load)
5167 newload += FIXED_1 - 1;
5168
5169 return newload / FIXED_1;
5170 }
5171
5172 /*
5173 * Return 0 means that container p->cg is closed.
5174 * Return -1 means that error occurred in refresh.
5175 * Positive num equals the total number of pid.
5176 */
5177 static int refresh_load(struct load_node *p, char *path)
5178 {
5179 FILE *f = NULL;
5180 char **idbuf;
5181 char proc_path[256];
5182 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5183 char *line = NULL;
5184 size_t linelen = 0;
5185 int sum, length;
5186 DIR *dp;
5187 struct dirent *file;
5188
5189 do {
5190 idbuf = malloc(sizeof(char *));
5191 } while (!idbuf);
5192 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5193 /* normal exit */
5194 if (sum == 0)
5195 goto out;
5196
5197 for (i = 0; i < sum; i++) {
5198 /*clean up '\n' */
5199 length = strlen(idbuf[i])-1;
5200 idbuf[i][length] = '\0';
5201 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5202 if (ret < 0 || ret > 255) {
5203 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5204 i = sum;
5205 sum = -1;
5206 goto err_out;
5207 }
5208
5209 dp = opendir(proc_path);
5210 if (!dp) {
5211 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5212 continue;
5213 }
5214 while ((file = readdir(dp)) != NULL) {
5215 if (strncmp(file->d_name, ".", 1) == 0)
5216 continue;
5217 if (strncmp(file->d_name, "..", 1) == 0)
5218 continue;
5219 total_pid++;
5220 /* We make the biggest pid become last_pid.*/
5221 ret = atof(file->d_name);
5222 last_pid = (ret > last_pid) ? ret : last_pid;
5223
5224 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5225 if (ret < 0 || ret > 255) {
5226 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5227 i = sum;
5228 sum = -1;
5229 closedir(dp);
5230 goto err_out;
5231 }
5232 f = fopen(proc_path, "r");
5233 if (f != NULL) {
5234 while (getline(&line, &linelen, f) != -1) {
5235 /* Find State */
5236 if ((line[0] == 'S') && (line[1] == 't'))
5237 break;
5238 }
5239 if ((line[7] == 'R') || (line[7] == 'D'))
5240 run_pid++;
5241 fclose(f);
5242 }
5243 }
5244 closedir(dp);
5245 }
5246 /*Calculate the loadavg.*/
5247 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5248 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5249 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5250 p->run_pid = run_pid;
5251 p->total_pid = total_pid;
5252 p->last_pid = last_pid;
5253
5254 free(line);
5255 err_out:
5256 for (; i > 0; i--)
5257 free(idbuf[i-1]);
5258 out:
5259 free(idbuf);
5260 return sum;
5261 }
5262 /*
5263 * Traverse the hash table and update it.
5264 */
5265 void *load_begin(void *arg)
5266 {
5267
5268 char *path = NULL;
5269 int i, sum, length, ret;
5270 struct load_node *f;
5271 int first_node;
5272 clock_t time1, time2;
5273
5274 while (1) {
5275 if (loadavg_stop == 1)
5276 return NULL;
5277
5278 time1 = clock();
5279 for (i = 0; i < LOAD_SIZE; i++) {
5280 pthread_mutex_lock(&load_hash[i].lock);
5281 if (load_hash[i].next == NULL) {
5282 pthread_mutex_unlock(&load_hash[i].lock);
5283 continue;
5284 }
5285 f = load_hash[i].next;
5286 first_node = 1;
5287 while (f) {
5288 length = strlen(f->cg) + 2;
5289 do {
5290 /* strlen(f->cg) + '.' or '' + \0 */
5291 path = malloc(length);
5292 } while (!path);
5293
5294 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5295 if (ret < 0 || ret > length - 1) {
5296 /* snprintf failed, ignore the node.*/
5297 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5298 goto out;
5299 }
5300 sum = refresh_load(f, path);
5301 if (sum == 0) {
5302 f = del_node(f, i);
5303 } else {
5304 out: f = f->next;
5305 }
5306 free(path);
5307 /* load_hash[i].lock locks only on the first node.*/
5308 if (first_node == 1) {
5309 first_node = 0;
5310 pthread_mutex_unlock(&load_hash[i].lock);
5311 }
5312 }
5313 }
5314
5315 if (loadavg_stop == 1)
5316 return NULL;
5317
5318 time2 = clock();
5319 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5320 }
5321 }
5322
5323 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5324 struct fuse_file_info *fi)
5325 {
5326 struct fuse_context *fc = fuse_get_context();
5327 struct file_info *d = (struct file_info *)fi->fh;
5328 pid_t initpid;
5329 char *cg;
5330 size_t total_len = 0;
5331 char *cache = d->buf;
5332 struct load_node *n;
5333 int hash;
5334 int cfd, rv = 0;
5335 unsigned long a, b, c;
5336
5337 if (offset) {
5338 if (offset > d->size)
5339 return -EINVAL;
5340 if (!d->cached)
5341 return 0;
5342 int left = d->size - offset;
5343 total_len = left > size ? size : left;
5344 memcpy(buf, cache + offset, total_len);
5345 return total_len;
5346 }
5347 if (!loadavg)
5348 return read_file("/proc/loadavg", buf, size, d);
5349
5350 initpid = lookup_initpid_in_store(fc->pid);
5351 if (initpid <= 0)
5352 initpid = fc->pid;
5353 cg = get_pid_cgroup(initpid, "cpu");
5354 if (!cg)
5355 return read_file("/proc/loadavg", buf, size, d);
5356
5357 prune_init_slice(cg);
5358 hash = calc_hash(cg) % LOAD_SIZE;
5359 n = locate_node(cg, hash);
5360
5361 /* First time */
5362 if (n == NULL) {
5363 if (!find_mounted_controller("cpu", &cfd)) {
5364 /*
5365 * In locate_node() above, pthread_rwlock_unlock() isn't used
5366 * because delete is not allowed before read has ended.
5367 */
5368 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5369 rv = 0;
5370 goto err;
5371 }
5372 do {
5373 n = malloc(sizeof(struct load_node));
5374 } while (!n);
5375
5376 do {
5377 n->cg = malloc(strlen(cg)+1);
5378 } while (!n->cg);
5379 strcpy(n->cg, cg);
5380 n->avenrun[0] = 0;
5381 n->avenrun[1] = 0;
5382 n->avenrun[2] = 0;
5383 n->run_pid = 0;
5384 n->total_pid = 1;
5385 n->last_pid = initpid;
5386 n->cfd = cfd;
5387 insert_node(&n, hash);
5388 }
5389 a = n->avenrun[0] + (FIXED_1/200);
5390 b = n->avenrun[1] + (FIXED_1/200);
5391 c = n->avenrun[2] + (FIXED_1/200);
5392 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5393 LOAD_INT(a), LOAD_FRAC(a),
5394 LOAD_INT(b), LOAD_FRAC(b),
5395 LOAD_INT(c), LOAD_FRAC(c),
5396 n->run_pid, n->total_pid, n->last_pid);
5397 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5398 if (total_len < 0 || total_len >= d->buflen) {
5399 lxcfs_error("%s\n", "Failed to write to cache");
5400 rv = 0;
5401 goto err;
5402 }
5403 d->size = (int)total_len;
5404 d->cached = 1;
5405
5406 if (total_len > size)
5407 total_len = size;
5408 memcpy(buf, d->buf, total_len);
5409 rv = total_len;
5410
5411 err:
5412 free(cg);
5413 return rv;
5414 }
5415 /* Return a positive number on success, return 0 on failure.*/
5416 pthread_t load_daemon(int load_use)
5417 {
5418 int ret;
5419 pthread_t pid;
5420
5421 ret = init_load();
5422 if (ret == -1) {
5423 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5424 return 0;
5425 }
5426 ret = pthread_create(&pid, NULL, load_begin, NULL);
5427 if (ret != 0) {
5428 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5429 load_free();
5430 return 0;
5431 }
5432 /* use loadavg, here loadavg = 1*/
5433 loadavg = load_use;
5434 return pid;
5435 }
5436
5437 /* Returns 0 on success. */
5438 int stop_load_daemon(pthread_t pid)
5439 {
5440 int s;
5441
5442 /* Signal the thread to gracefully stop */
5443 loadavg_stop = 1;
5444
5445 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5446 if (s != 0) {
5447 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5448 return -1;
5449 }
5450
5451 load_free();
5452 loadavg_stop = 0;
5453
5454 return 0;
5455 }
5456
/* Sum the byte length of every line in @which; 0 when the file can't be opened. */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t len = 0;
	ssize_t n, total = 0;
	FILE *f = fopen(which, "r");

	if (!f)
		return 0;

	for (;;) {
		n = getline(&line, &len, f);
		if (n == -1)
			break;
		total += n;
	}

	free(line);
	fclose(f);

	return total;
}
5473
/* getattr for the virtual /proc tree: /proc itself is a read-only
 * directory, the seven emulated files are read-only regular files. */
int proc_getattr(const char *path, struct stat *sb)
{
	struct timespec now;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	if (!strcmp(path, "/proc/meminfo") ||
	    !strcmp(path, "/proc/cpuinfo") ||
	    !strcmp(path, "/proc/uptime") ||
	    !strcmp(path, "/proc/stat") ||
	    !strcmp(path, "/proc/diskstats") ||
	    !strcmp(path, "/proc/swaps") ||
	    !strcmp(path, "/proc/loadavg")) {
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
5503
5504 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5505 struct fuse_file_info *fi)
5506 {
5507 if (filler(buf, ".", NULL, 0) != 0 ||
5508 filler(buf, "..", NULL, 0) != 0 ||
5509 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5510 filler(buf, "meminfo", NULL, 0) != 0 ||
5511 filler(buf, "stat", NULL, 0) != 0 ||
5512 filler(buf, "uptime", NULL, 0) != 0 ||
5513 filler(buf, "diskstats", NULL, 0) != 0 ||
5514 filler(buf, "swaps", NULL, 0) != 0 ||
5515 filler(buf, "loadavg", NULL, 0) != 0)
5516 return -EINVAL;
5517 return 0;
5518 }
5519
5520 int proc_open(const char *path, struct fuse_file_info *fi)
5521 {
5522 int type = -1;
5523 struct file_info *info;
5524
5525 if (strcmp(path, "/proc/meminfo") == 0)
5526 type = LXC_TYPE_PROC_MEMINFO;
5527 else if (strcmp(path, "/proc/cpuinfo") == 0)
5528 type = LXC_TYPE_PROC_CPUINFO;
5529 else if (strcmp(path, "/proc/uptime") == 0)
5530 type = LXC_TYPE_PROC_UPTIME;
5531 else if (strcmp(path, "/proc/stat") == 0)
5532 type = LXC_TYPE_PROC_STAT;
5533 else if (strcmp(path, "/proc/diskstats") == 0)
5534 type = LXC_TYPE_PROC_DISKSTATS;
5535 else if (strcmp(path, "/proc/swaps") == 0)
5536 type = LXC_TYPE_PROC_SWAPS;
5537 else if (strcmp(path, "/proc/loadavg") == 0)
5538 type = LXC_TYPE_PROC_LOADAVG;
5539 if (type == -1)
5540 return -ENOENT;
5541
5542 info = malloc(sizeof(*info));
5543 if (!info)
5544 return -ENOMEM;
5545
5546 memset(info, 0, sizeof(*info));
5547 info->type = type;
5548
5549 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5550 do {
5551 info->buf = malloc(info->buflen);
5552 } while (!info->buf);
5553 memset(info->buf, 0, info->buflen);
5554 /* set actual size to buffer size */
5555 info->size = info->buflen;
5556
5557 fi->fh = (unsigned long)info;
5558 return 0;
5559 }
5560
/* Access check for the virtual /proc tree: everything is read-only. */
int proc_access(const char *path, int mask)
{
	if (!strcmp(path, "/proc") && access(path, R_OK) == 0)
		return 0;

	/* Any bit beyond R_OK (write/execute) is denied. */
	return (mask & ~R_OK) ? -EACCES : 0;
}
5571
/* Release an emulated /proc file: free the per-open file_info state. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
5577
5578 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5579 struct fuse_file_info *fi)
5580 {
5581 struct file_info *f = (struct file_info *) fi->fh;
5582
5583 switch (f->type) {
5584 case LXC_TYPE_PROC_MEMINFO:
5585 return proc_meminfo_read(buf, size, offset, fi);
5586 case LXC_TYPE_PROC_CPUINFO:
5587 return proc_cpuinfo_read(buf, size, offset, fi);
5588 case LXC_TYPE_PROC_UPTIME:
5589 return proc_uptime_read(buf, size, offset, fi);
5590 case LXC_TYPE_PROC_STAT:
5591 return proc_stat_read(buf, size, offset, fi);
5592 case LXC_TYPE_PROC_DISKSTATS:
5593 return proc_diskstats_read(buf, size, offset, fi);
5594 case LXC_TYPE_PROC_SWAPS:
5595 return proc_swaps_read(buf, size, offset, fi);
5596 case LXC_TYPE_PROC_LOADAVG:
5597 return proc_loadavg_read(buf, size, offset, fi);
5598 default:
5599 return -EINVAL;
5600 }
5601 }
5602
5603 /*
5604 * Functions needed to setup cgroups in the __constructor__.
5605 */
5606
/* Create @dir and all missing parents with @mode (like `mkdir -p`).
 * Returns false on allocation failure or any mkdir error other than EEXIST. */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *segment = dir;
	const char *cursor = dir;

	do {
		char *partial;

		/* Advance past any run of '/', then to the end of the next
		 * path component. */
		segment = cursor + strspn(cursor, "/");
		cursor = segment + strcspn(segment, "/");

		/* mkdir the prefix up to (but not including) this component. */
		partial = strndup(dir, segment - dir);
		if (!partial)
			return false;
		if (mkdir(partial, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    partial, strerror(errno));
			free(partial);
			return false;
		}
		free(partial);
	} while (cursor != segment);

	return true;
}
5630
5631 static bool umount_if_mounted(void)
5632 {
5633 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5634 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5635 return false;
5636 }
5637 return true;
5638 }
5639
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* True when the statfs result @fs reports the filesystem magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic found = fs->f_type;

	return found == magic_val;
}
5646
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	bool found = false;
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!found && getline(&line, &len, f) != -1) {
		char *field, *end;
		int skip;

		/* Skip the first four space-separated fields to reach the
		 * mount point. */
		field = line;
		for (skip = 0; field && skip < 4; skip++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;
		end = strchr(field + 1, ' ');
		if (!end)
			continue;
		*end = '\0';

		if (strcmp(field + 1, "/") == 0) {
			/* this is '/'. is it the ramfs? */
			char *opts = strchr(end + 1, '-');
			if (opts && strncmp(opts, "- rootfs rootfs ", 16) == 0)
				found = true;
		}
	}

	free(line);
	fclose(f);
	return found;
}
5689
5690 static int pivot_enter()
5691 {
5692 int ret = -1, oldroot = -1, newroot = -1;
5693
5694 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5695 if (oldroot < 0) {
5696 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5697 return ret;
5698 }
5699
5700 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5701 if (newroot < 0) {
5702 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5703 goto err;
5704 }
5705
5706 /* change into new root fs */
5707 if (fchdir(newroot) < 0) {
5708 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5709 goto err;
5710 }
5711
5712 /* pivot_root into our new root fs */
5713 if (pivot_root(".", ".") < 0) {
5714 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5715 goto err;
5716 }
5717
5718 /*
5719 * At this point the old-root is mounted on top of our new-root.
5720 * To unmounted it we must not be chdir'd into it, so escape back
5721 * to the old-root.
5722 */
5723 if (fchdir(oldroot) < 0) {
5724 lxcfs_error("%s\n", "Failed to enter old root.");
5725 goto err;
5726 }
5727
5728 if (umount2(".", MNT_DETACH) < 0) {
5729 lxcfs_error("%s\n", "Failed to detach old root.");
5730 goto err;
5731 }
5732
5733 if (fchdir(newroot) < 0) {
5734 lxcfs_error("%s\n", "Failed to re-enter new root.");
5735 goto err;
5736 }
5737
5738 ret = 0;
5739
5740 err:
5741 if (oldroot > 0)
5742 close(oldroot);
5743 if (newroot > 0)
5744 close(newroot);
5745
5746 return ret;
5747 }
5748
5749 static int chroot_enter()
5750 {
5751 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5752 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5753 return -1;
5754 }
5755
5756 if (chroot(".") < 0) {
5757 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5758 return -1;
5759 }
5760
5761 if (chdir("/") < 0) {
5762 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5763 return -1;
5764 }
5765
5766 return 0;
5767 }
5768
5769 static int permute_and_enter(void)
5770 {
5771 struct statfs sb;
5772
5773 if (statfs("/", &sb) < 0) {
5774 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5775 return -1;
5776 }
5777
5778 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5779 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5780 * /proc/1/mountinfo. */
5781 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5782 return chroot_enter();
5783
5784 if (pivot_enter() < 0) {
5785 lxcfs_error("%s\n", "Could not perform pivot root.");
5786 return -1;
5787 }
5788
5789 return 0;
5790 }
5791
5792 /* Prepare our new clean root. */
5793 static int permute_prepare(void)
5794 {
5795 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5796 lxcfs_error("%s\n", "Failed to create directory for new root.");
5797 return -1;
5798 }
5799
5800 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5801 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5802 return -1;
5803 }
5804
5805 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5806 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5807 return -1;
5808 }
5809
5810 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5811 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5812 return -1;
5813 }
5814
5815 return 0;
5816 }
5817
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare the new root, then pivot into it. */
	return permute_prepare() == 0 && permute_and_enter() == 0;
}
5831
/* Open /proc/<pid>/ns/mnt to keep a handle on that mount namespace.
 * Returns an O_CLOEXEC fd, or -1 on error. */
static int preserve_mnt_ns(int pid)
{
	/* 21 covers the decimal digits of any 64-bit pid value. */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int ret = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);

	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
5844
/* Prepare BASEDIR in a fresh private mount namespace and put a tmpfs on it;
 * the per-controller cgroup mounts are later created on top of that tmpfs.
 * The new namespace is preserved via cgroup_mount_ns_fd.
 * NOTE: the order of these steps is the behavior — do not reorder. */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Detach into a private mount namespace so these mounts stay invisible
	 * to the rest of the system. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a handle on the new namespace for later re-entry. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Prevent our mount events from propagating back out. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
5880
5881 static bool cgfs_mount_hierarchies(void)
5882 {
5883 char *target;
5884 size_t clen, len;
5885 int i, ret;
5886
5887 for (i = 0; i < num_hierarchies; i++) {
5888 char *controller = hierarchies[i];
5889
5890 clen = strlen(controller);
5891 len = strlen(BASEDIR) + clen + 2;
5892 target = malloc(len);
5893 if (!target)
5894 return false;
5895
5896 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5897 if (ret < 0 || ret >= len) {
5898 free(target);
5899 return false;
5900 }
5901 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5902 free(target);
5903 return false;
5904 }
5905 if (!strcmp(controller, "unified"))
5906 ret = mount("none", target, "cgroup2", 0, NULL);
5907 else
5908 ret = mount(controller, target, "cgroup", 0, controller);
5909 if (ret < 0) {
5910 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5911 free(target);
5912 return false;
5913 }
5914
5915 fd_hierarchies[i] = open(target, O_DIRECTORY);
5916 if (fd_hierarchies[i] < 0) {
5917 free(target);
5918 return false;
5919 }
5920 free(target);
5921 }
5922 return true;
5923 }
5924
/* Full cgroup setup: prepare the private mounts, mount each hierarchy,
 * then pivot/chroot into the new clean root. */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5940
/* Library constructor: parse /proc/self/cgroup to record the mounted
 * hierarchies, then build lxcfs' private cgroup mounts inside a separate
 * mount namespace, switch back to the initial namespace, and initialize
 * the per-container CPU view. */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		/* Each line has the form: <idx>:<controllers>:<path> */
		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One fd per hierarchy; -1 marks "not opened yet". */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember where we are: cgfs_setup_controllers() changes directories. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Return to the namespace we came from. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6031
6032 static void __attribute__((destructor)) free_subsystems(void)
6033 {
6034 int i;
6035
6036 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6037
6038 for (i = 0; i < num_hierarchies; i++) {
6039 if (hierarchies[i])
6040 free(hierarchies[i]);
6041 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6042 close(fd_hierarchies[i]);
6043 }
6044 free(hierarchies);
6045 free(fd_hierarchies);
6046 free_cpuview();
6047
6048 if (cgroup_mount_ns_fd >= 0)
6049 close(cgroup_mount_ns_fd);
6050 }