]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
utils: split helpers from bindings.c into utils.{c,h}
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "config.h"
42 #include "cgroups/cgroup.h"
43 #include "cgroups/cgroup_utils.h"
44 #include "memory_utils.h"
45 #include "utils.h"
46
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/*
 * pivot_root - change the root filesystem of the calling process.
 * Thin wrapper over the raw syscall; returns -1 with errno = ENOSYS
 * when the kernel headers do not define __NR_pivot_root.
 */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
61
/*
 * One per-CPU usage sample for the CPU view code below.
 * NOTE(review): values appear to come from cpuacct / /proc/stat readers
 * elsewhere in this file — confirm units there.
 */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online; /* whether this CPU slot is currently considered online */
};

/* Parameters of the per-cgroup loadavg hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* refresh period — presumably seconds; see the refresh thread */
#define DEPTH_DIR 3 /* directory depth scanned per cgroup */
/* Fixed-point constants for the loadavg calculation (mirror the kernel's). */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* NOTE(review): presumably signals the loadavg worker to exit — confirm at its loop. */
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * calc_hash - ELF hash of a NUL-terminated string, masked to a
 * non-negative int so it can be used as a bucket index.
 */
static int calc_hash(const char *name)
{
	unsigned int hash = 0, top;

	while (*name) {
		hash = (hash << 4) + *name++;
		top = hash & 0xf0000000;
		if (top)
			hash ^= top >> 24;
		hash &= ~top;
	}

	return (int)(hash & 0x7fffffff);
}
101
/* One tracked cgroup in the loadavg hash table. */
struct load_node {
	char *cg; /* cgroup path this node tracks */
	unsigned long avenrun[3]; /* Load averages (fixed-point, see FSHIFT above) */
	unsigned int run_pid;   /* presumably count of running tasks — confirm in refresh code */
	unsigned int total_pid; /* presumably total task count — confirm in refresh code */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next;
	/* Address of the previous node's `next` field (or of the bucket
	 * head), enabling O(1) unlink in del_node(). */
	struct load_node **pre;
};

/* One bucket of the loadavg hash table. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node.To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node.To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node.To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
136 /*
137 * init_load initialize the hash table.
138 * Return 0 on success, return -1 on failure.
139 */
140 static int init_load(void)
141 {
142 int i;
143 int ret;
144
145 for (i = 0; i < LOAD_SIZE; i++) {
146 load_hash[i].next = NULL;
147 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
148 if (ret != 0) {
149 lxcfs_error("%s\n", "Failed to initialize lock");
150 goto out3;
151 }
152 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
153 if (ret != 0) {
154 lxcfs_error("%s\n", "Failed to initialize rdlock");
155 goto out2;
156 }
157 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
158 if (ret != 0) {
159 lxcfs_error("%s\n", "Failed to initialize rilock");
160 goto out1;
161 }
162 }
163 return 0;
164 out1:
165 pthread_rwlock_destroy(&load_hash[i].rdlock);
166 out2:
167 pthread_mutex_destroy(&load_hash[i].lock);
168 out3:
169 while (i > 0) {
170 i--;
171 pthread_mutex_destroy(&load_hash[i].lock);
172 pthread_rwlock_destroy(&load_hash[i].rdlock);
173 pthread_rwlock_destroy(&load_hash[i].rilock);
174 }
175 return -1;
176 }
177
/*
 * insert_node - push *n onto the front of hash bucket @locate.
 * Holds the bucket's mutex (vs. other inserts/refresh) and write-locks
 * rilock (vs. readers entering the list head) for the duration.
 */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	/* Wire up the back-pointers so del_node() can unlink in O(1). */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node - find the node for cgroup @cg in bucket @locate.
 * A non-NULL return means success.
 *
 * NOTE: rdlock is intentionally NOT released on any return path
 * (including the NULL return): the caller reads the node's fields and
 * must be protected from concurrent deletion. The caller — i.e.
 * proc_loadavg_read() — is responsible for unlocking rdlock.
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* rilock only guards the list head; drop it once we are past it. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
218
219 /* Delete the load_node n and return the next node of it. */
220 static struct load_node *del_node(struct load_node *n, int locate)
221 {
222 struct load_node *g;
223
224 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
225 if (n->next == NULL) {
226 *(n->pre) = NULL;
227 } else {
228 *(n->pre) = n->next;
229 n->next->pre = n->pre;
230 }
231 g = n->next;
232 free_disarm(n->cg);
233 free_disarm(n);
234 pthread_rwlock_unlock(&load_hash[locate].rdlock);
235 return g;
236 }
237
/*
 * load_free - tear down the whole loadavg hash table.
 * For every bucket: take all three locks, free the node chain, then
 * unlock and destroy each lock. The exact unlock/destroy interleaving
 * of the original is preserved deliberately.
 */
static void load_free(void)
{
	struct load_node *f, *p;

	for (int i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			/* Empty bucket: just release and destroy the locks. */
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}

		/* Free every node in this bucket's chain. */
		for (f = load_hash[i].next; f;) {
			free_disarm(f->cg);
			p = f->next;
			free_disarm(f);
			f = p;
		}

		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
271
/* Data for CPU view */
struct cg_proc_stat {
	char *cg; /* cgroup this entry caches stats for */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* number of entries in usage/view */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

/* One bucket of the cpuview history hash table. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck; /* last time this bucket was pruned/checked */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
294
295 static bool cpuview_init_head(struct cg_proc_stat_head **head)
296 {
297 *head = malloc(sizeof(struct cg_proc_stat_head));
298 if (!(*head)) {
299 lxcfs_error("%s\n", strerror(errno));
300 return false;
301 }
302
303 (*head)->lastcheck = time(NULL);
304 (*head)->next = NULL;
305
306 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
307 lxcfs_error("%s\n", "Failed to initialize list lock");
308 free_disarm(*head);
309 return false;
310 }
311
312 return true;
313 }
314
315 static bool init_cpuview()
316 {
317 int i;
318
319 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
320 proc_stat_history[i] = NULL;
321
322 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
323 if (!cpuview_init_head(&proc_stat_history[i]))
324 goto err;
325 }
326
327 return true;
328
329 err:
330 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
331 if (proc_stat_history[i])
332 free_disarm(proc_stat_history[i]);
333 }
334
335 return false;
336 }
337
338 static void free_proc_stat_node(struct cg_proc_stat *node)
339 {
340 pthread_mutex_destroy(&node->lock);
341 free_disarm(node->cg);
342 free_disarm(node->usage);
343 free_disarm(node->view);
344 free_disarm(node);
345 }
346
347 static void cpuview_free_head(struct cg_proc_stat_head *head)
348 {
349 struct cg_proc_stat *node, *tmp;
350
351 if (head->next) {
352 node = head->next;
353
354 for (;;) {
355 tmp = node;
356 node = node->next;
357 free_proc_stat_node(tmp);
358
359 if (!node)
360 break;
361 }
362 }
363
364 pthread_rwlock_destroy(&head->lock);
365 free_disarm(head);
366 }
367
368 static void free_cpuview()
369 {
370 int i;
371
372 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
373 if (proc_stat_history[i])
374 cpuview_free_head(proc_stat_history[i]);
375 }
376 }
377
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck; // last time this entry was used/verified (for pruning)
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets keyed by HASH(ino); all access goes through store_lock/store_unlock. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* lock_mutex - pthread_mutex_lock that aborts the process on failure. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
415
/* Global cgroup driver handle; initialized during startup (outside this
 * view) and consulted by the cgfs_* helpers below. */
struct cgroup_ops *cgroup_ops;
417
/* unlock_mutex - pthread_mutex_unlock that aborts the process on failure. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
427
/* Acquire the global pidns-init-store mutex. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the global pidns-init-store mutex. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
437
438 /* Must be called under store_lock */
439 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
440 {
441 struct stat initsb;
442 char fnam[100];
443
444 snprintf(fnam, 100, "/proc/%d", e->initpid);
445 if (stat(fnam, &initsb) < 0)
446 return false;
447
448 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
449 initsb.st_ctime, e->initpid);
450
451 if (e->ctime != initsb.st_ctime)
452 return false;
453 return true;
454 }
455
456 /* Must be called under store_lock */
457 static void remove_initpid(struct pidns_init_store *e)
458 {
459 struct pidns_init_store *tmp;
460 int h;
461
462 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
463
464 h = HASH(e->ino);
465 if (pidns_hash_table[h] == e) {
466 pidns_hash_table[h] = e->next;
467 free_disarm(e);
468 return;
469 }
470
471 tmp = pidns_hash_table[h];
472 while (tmp) {
473 if (tmp->next == e) {
474 tmp->next = e->next;
475 free_disarm(e);
476 return;
477 }
478 tmp = tmp->next;
479 }
480 }
481
#define PURGE_SECS 5
/*
 * prune_initpid_store - age out cached init-pid entries.
 * Runs at most once per PURGE_SECS; drops entries whose lastcheck is
 * older than 2*PURGE_SECS. The first invocation only records the time
 * and prunes nothing. Must be called under store_lock.
 */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				/* Unlink e, keeping prev in place for the next iteration. */
				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free_disarm(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
524
/*
 * save_initpid - cache @pid as the init pid for the pid namespace whose
 * /proc/<qpid>/ns/pid stat buffer is @sb. Records /proc/<pid>'s ctime
 * so later lookups can detect pid reuse. Silently does nothing if
 * /proc/<pid> cannot be stat'ed. Must be called under store_lock.
 */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	/* Retry-until-success allocation: this file treats OOM as non-fatal
	 * only by spinning (the "must succeed" idiom used throughout). */
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}
549
550 /*
551 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
552 * entry for the inode number and creation time. Verify that the init pid
553 * is still valid. If not, remove it. Return the entry if valid, NULL
554 * otherwise.
555 * Must be called under store_lock
556 */
557 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
558 {
559 int h = HASH(sb->st_ino);
560 struct pidns_init_store *e = pidns_hash_table[h];
561
562 while (e) {
563 if (e->ino == sb->st_ino) {
564 if (initpid_still_valid(e, sb)) {
565 e->lastcheck = time(NULL);
566 return e;
567 }
568 remove_initpid(e);
569 return NULL;
570 }
571 e = e->next;
572 }
573
574 return NULL;
575 }
576
577 static int is_dir(const char *path, int fd)
578 {
579 struct stat statbuf;
580 int ret = fstatat(fd, path, &statbuf, fd);
581 if (ret == 0 && S_ISDIR(statbuf.st_mode))
582 return 1;
583 return 0;
584 }
585
/*
 * write_string - write @string to the open descriptor @fd, which is
 * consumed (closed) in all cases. @fnam is only used for error
 * messages. Returns true if the full string was written and the stream
 * closed cleanly.
 *
 * Fix: fd used to leak when fdopen() failed; it is now closed.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() failed, so we still own fd and must close it. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a short flush (e.g. ENOSPC) surfaces here. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
611
/* Ownership/mode metadata for one cgroup file, as built by cgfs_get_key(). */
struct cgfs_files {
	char *name;        /* file name (heap-allocated, freed by free_key()) */
	uint32_t uid, gid; /* owner */
	uint32_t mode;     /* st_mode bits */
};
617
/*
 * print_subsystems - debug helper: dump the detected cgroup hierarchies
 * to stderr, one line per hierarchy with its fd and comma-joined
 * controller list.
 */
static void print_subsystems(void)
{
	int i = 0;

	fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		__do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
		fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
	}
}
629
/*
 * cgfs_set_value - write @value into <controller>/<cgroup>/<file>.
 * Returns true on success. The descriptor opened here is consumed by
 * write_string() via fdopen()/fclose().
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
656
657 // Chown all the files in the cgroup directory. We do this when we create
658 // a cgroup on behalf of a user.
659 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
660 {
661 struct dirent *direntp;
662 char path[MAXPATHLEN];
663 size_t len;
664 DIR *d;
665 int fd1, ret;
666
667 len = strlen(dirname);
668 if (len >= MAXPATHLEN) {
669 lxcfs_error("Pathname too long: %s\n", dirname);
670 return;
671 }
672
673 fd1 = openat(fd, dirname, O_DIRECTORY);
674 if (fd1 < 0)
675 return;
676
677 d = fdopendir(fd1);
678 if (!d) {
679 lxcfs_error("Failed to open %s\n", dirname);
680 return;
681 }
682
683 while ((direntp = readdir(d))) {
684 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
685 continue;
686 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
687 if (ret < 0 || ret >= MAXPATHLEN) {
688 lxcfs_error("Pathname too long under %s\n", dirname);
689 continue;
690 }
691 if (fchownat(fd, path, uid, gid, 0) < 0)
692 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
693 }
694 closedir(d);
695 }
696
697 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
698 {
699 int cfd;
700 size_t len;
701 char *dirnam;
702
703 cfd = get_cgroup_fd(controller);
704 if (cfd < 0)
705 return -EINVAL;
706
707 /* Make sure we pass a relative path to *at() family of functions.
708 * . + /cg + \0
709 */
710 len = strlen(cg) + 2;
711 dirnam = alloca(len);
712 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
713
714 if (mkdirat(cfd, dirnam, 0755) < 0)
715 return -errno;
716
717 if (uid == 0 && gid == 0)
718 return 0;
719
720 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
721 return -errno;
722
723 chown_all_cgroup_files(dirnam, uid, gid, cfd);
724
725 return 0;
726 }
727
/*
 * recursive_rmdir - remove @dirname (a path relative to cgroup fd @cfd)
 * and its subdirectories. @fd is an open fd for the directory being
 * walked; it is dup()ed before fdopendir() so the caller keeps a usable
 * descriptor.
 *
 * NOTE(review): the recursive call passes the same @fd rather than
 * opening the child directory, and the dup()ed fd shares its offset —
 * this looks suspicious but matches long-standing behavior; deletion
 * itself always goes through @cfd-relative paths via unlinkat().
 * Confirm against upstream before changing.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Only directories are recursed into; cgroupfs has no regular
		 * files that can be unlinked individually. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
786
787 bool cgfs_remove(const char *controller, const char *cg)
788 {
789 int fd, cfd;
790 size_t len;
791 char *dirnam;
792 bool bret;
793
794 cfd = get_cgroup_fd(controller);
795 if (cfd < 0)
796 return false;
797
798 /* Make sure we pass a relative path to *at() family of functions.
799 * . + /cg + \0
800 */
801 len = strlen(cg) + 2;
802 dirnam = alloca(len);
803 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
804
805 fd = openat(cfd, dirnam, O_DIRECTORY);
806 if (fd < 0)
807 return false;
808
809 bret = recursive_rmdir(dirnam, fd, cfd);
810 close(fd);
811 return bret;
812 }
813
/*
 * cgfs_chmod_file - chmod @file (relative to @controller's mount) to
 * @mode. Returns true on success.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *path;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", dot_or_empty(file), file);

	return fchmodat(cfd, path, mode, 0) == 0;
}
834
835 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
836 {
837 size_t len;
838 char *fname;
839
840 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
841 fname = alloca(len);
842 snprintf(fname, len, "%s/tasks", dirname);
843 if (fchownat(fd, fname, uid, gid, 0) != 0)
844 return -errno;
845 snprintf(fname, len, "%s/cgroup.procs", dirname);
846 if (fchownat(fd, fname, uid, gid, 0) != 0)
847 return -errno;
848 return 0;
849 }
850
851 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
852 {
853 int cfd;
854 size_t len;
855 char *pathname;
856
857 cfd = get_cgroup_fd(controller);
858 if (cfd < 0)
859 return false;
860
861 /* Make sure we pass a relative path to *at() family of functions.
862 * . + /file + \0
863 */
864 len = strlen(file) + 2;
865 pathname = alloca(len);
866 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
867 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
868 return -errno;
869
870 if (is_dir(pathname, cfd))
871 // like cgmanager did, we want to chown the tasks file as well
872 return chown_tasks_files(pathname, uid, gid, cfd);
873
874 return 0;
875 }
876
/*
 * open_pids_file - open <controller>/<cgroup>/cgroup.procs for writing.
 * Returns a FILE * stream on success, NULL on failure.
 *
 * Fixes: the unknown-controller path used to `return false` from a
 * pointer-returning function, and the descriptor leaked when fdopen()
 * failed.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname;
	FILE *f;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd); /* fdopen() failed; fd would otherwise leak */

	return f;
}
900
901 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
902 void ***list, size_t typesize,
903 void* (*iterator)(const char*, const char*, const char*))
904 {
905 int cfd, fd, ret;
906 size_t len;
907 char *cg;
908 char pathname[MAXPATHLEN];
909 size_t sz = 0, asz = 0;
910 struct dirent *dirent;
911 DIR *dir;
912
913 cfd = get_cgroup_fd(controller);
914 *list = NULL;
915 if (cfd < 0)
916 return false;
917
918 /* Make sure we pass a relative path to *at() family of functions. */
919 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
920 cg = alloca(len);
921 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
922 if (ret < 0 || (size_t)ret >= len) {
923 lxcfs_error("Pathname too long under %s\n", cgroup);
924 return false;
925 }
926
927 fd = openat(cfd, cg, O_DIRECTORY);
928 if (fd < 0)
929 return false;
930
931 dir = fdopendir(fd);
932 if (!dir)
933 return false;
934
935 while ((dirent = readdir(dir))) {
936 struct stat mystat;
937
938 if (!strcmp(dirent->d_name, ".") ||
939 !strcmp(dirent->d_name, ".."))
940 continue;
941
942 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
943 if (ret < 0 || ret >= MAXPATHLEN) {
944 lxcfs_error("Pathname too long under %s\n", cg);
945 continue;
946 }
947
948 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
949 if (ret) {
950 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
951 continue;
952 }
953 if ((!directories && !S_ISREG(mystat.st_mode)) ||
954 (directories && !S_ISDIR(mystat.st_mode)))
955 continue;
956
957 if (sz+2 >= asz) {
958 void **tmp;
959 asz += BATCH_SIZE;
960 do {
961 tmp = realloc(*list, asz * typesize);
962 } while (!tmp);
963 *list = tmp;
964 }
965 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
966 (*list)[sz+1] = NULL;
967 sz++;
968 }
969 if (closedir(dir) < 0) {
970 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
971 return false;
972 }
973 return true;
974 }
975
976 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
977 {
978 char *dup;
979 do {
980 dup = strdup(dir_entry);
981 } while (!dup);
982 return dup;
983 }
984
/*
 * cgfs_list_children - fill *list with a NULL-terminated array of the
 * names of @cgroup's child cgroups (directories). The caller owns the
 * array and each entry.
 */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
989
990 void free_key(struct cgfs_files *k)
991 {
992 if (!k)
993 return;
994 free_disarm(k->name);
995 free_disarm(k);
996 }
997
/* free_keys - release a NULL-terminated array of cgfs_files entries (NULL-safe). */
void free_keys(struct cgfs_files **keys)
{
	if (!keys)
		return;

	for (struct cgfs_files **it = keys; *it; it++)
		free_key(*it);

	free_disarm(keys);
}
1009
/*
 * cgfs_param_exist - check whether <controller>/<cgroup>/<file> exists.
 * Returns true when the path is accessible (F_OK).
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *path;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + /cgroup + / + file + NUL. */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1031
1032 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1033 {
1034 int ret, cfd;
1035 size_t len;
1036 char *fnam;
1037 struct stat sb;
1038 struct cgfs_files *newkey;
1039
1040 cfd = get_cgroup_fd(controller);
1041 if (cfd < 0)
1042 return false;
1043
1044 if (file && *file == '/')
1045 file++;
1046
1047 if (file && strchr(file, '/'))
1048 return NULL;
1049
1050 /* Make sure we pass a relative path to *at() family of functions.
1051 * . + /cgroup + / + file + \0
1052 */
1053 len = strlen(cgroup) + 3;
1054 if (file)
1055 len += strlen(file) + 1;
1056 fnam = alloca(len);
1057 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1058 file ? "/" : "", file ? file : "");
1059
1060 ret = fstatat(cfd, fnam, &sb, 0);
1061 if (ret < 0)
1062 return NULL;
1063
1064 do {
1065 newkey = malloc(sizeof(struct cgfs_files));
1066 } while (!newkey);
1067 if (file)
1068 newkey->name = must_copy_string(file);
1069 else if (strrchr(cgroup, '/'))
1070 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1071 else
1072 newkey->name = must_copy_string(cgroup);
1073 newkey->uid = sb.st_uid;
1074 newkey->gid = sb.st_gid;
1075 newkey->mode = sb.st_mode;
1076
1077 return newkey;
1078 }
1079
/*
 * make_key_list_entry - iterator for cgfs_list_keys(): build a
 * cgfs_files entry for one directory entry. Failures are logged and
 * NULL is returned (and stored in the result array).
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);

	if (!entry)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return entry;
}
1089
/*
 * cgfs_list_keys - fill *keys with a NULL-terminated array of
 * cgfs_files entries for every regular file in @cgroup. The caller
 * frees the result with free_keys().
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1094
/*
 * is_child_cgroup - check whether @f is a child cgroup (a directory)
 * of <controller>/<cgroup>. Returns true only for directories.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *path;
	struct stat sb;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + /cgroup + / + f + NUL. */
	len = strlen(cgroup) + strlen(f) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1122
/* Result codes for send_creds()/recv_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations for the pid-namespace credential handshake below. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1130
/*
 * clone a task which switches to @task's namespace and writes '1'.
 * over a unix sock so we can read the task's reaper's pid in our
 * namespace
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* Stack for the clone()d child; one page on the caller's stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* Enter the target task's pid namespace before cloning. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* clone() takes the top of the stack (grows downward on supported arches). */
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1172
1173 static int send_creds_clone_wrapper(void *arg) {
1174 struct ucred cred;
1175 char v;
1176 int sock = *(int *)arg;
1177
1178 /* we are the child */
1179 cred.uid = 0;
1180 cred.gid = 0;
1181 cred.pid = 1;
1182 v = '1';
1183 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1184 return 1;
1185 return 0;
1186 }
1187
/*
 * get_init_pid_for_task - discover the init pid (in our namespace) of
 * the pid namespace that @task lives in.
 *
 * Forks a helper which enters @task's pidns and sends SCM_CREDENTIALS
 * as pid 1; the kernel translates that pid into our namespace, so the
 * cred.pid we receive here is the namespace's init as we see it.
 * Returns the translated pid or -1 on failure.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		/* Child: keep sock[0], do the namespace dance, then exit. */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1221
/*
 * lookup_initpid_in_store - return the init pid for @qpid's pid
 * namespace, using the cache where possible.
 *
 * On a cache miss the answer is computed via get_init_pid_for_task()
 * and stored. Returns 0 when the namespace cannot be stat'ed or no
 * init pid could be determined.
 */
pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}
1249
/*
 * Reap child @pid, retrying on EINTR.
 * Returns 0 if the child terminated normally with exit status 0,
 * -1 otherwise (including for non-positive pids).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1270
1271
/*
 * append pid to *src.
 * src: a pointer to a char* in which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 * Delegates to must_strcat() with a "%d\n" format, so each pid ends up
 * on its own line.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	must_strcat(src, sz, asz, "%d\n", (int)pid);
}
1283
/*
 * Given an open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.  Map lines have the form
 * "<ns base> <host base> <count>".
 * Returns the mapped id, or -1 (i.e. UINT_MAX) on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less than nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
1327
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * May the fuse caller (@pid running as @uid) act on an object owned by
 * @victim?  When @req_ns_root is set the caller must additionally map
 * to root (uid 0) inside its own user namespace; otherwise sharing the
 * uid is sufficient.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	bool allowed = false;
	char path[PROCLEN];
	uid_t mapped;
	int len;
	FILE *f;

	if (victim == -1 || uid == -1)
		return false;

	/* Same-uid access suffices unless ns-root is required. */
	if (uid == victim && !req_ns_root)
		return true;

	len = snprintf(path, PROCLEN, "/proc/%d/uid_map", pid);
	if (len < 0 || len >= PROCLEN)
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	/* The caller must be root inside its own namespace. */
	mapped = convert_id_to_ns(f, uid);
	if (mapped != 0)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	mapped = convert_id_to_ns(f, victim);
	if (mapped != -1)
		allowed = true;

out:
	fclose(f);
	return allowed;
}
1383
/*
 * Does the rwx permission triplet @fmode (shifted to the "other" bit
 * positions by the caller) grant the access requested by the open
 * flags in @req_mode?
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	/* Required bits per O_ACCMODE value (O_RDONLY/O_WRONLY/O_RDWR). */
	static const mode_t required[] = {
		[O_RDONLY] = S_IROTH,
		[O_WRONLY] = S_IWOTH,
		[O_RDWR]   = S_IROTH | S_IWOTH,
	};
	mode_t acc = req_mode & O_ACCMODE;

	if (acc > O_RDWR)
		return false;

	return (fmode & required[acc]) == required[acc];
}
1403
1404
/*
 * Return the first path component of @taskcg lying below @querycg,
 * e.g. taskcg "/a/b/c/d/e" with querycg "/a/b/c" yields "d".
 * NOTE(review): the original header comment had the two argument roles
 * swapped; the length check below requires @taskcg to be the longer
 * path, matching how caller_is_in_ancestor() invokes us.
 * The result is heap-allocated and must be freed by the caller; NULL
 * is returned on bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *component, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		component = strdup(taskcg + 1);
	else
		component = strdup(taskcg + strlen(querycg) + 1);
	if (!component)
		return NULL;

	/* Keep only the first path component. */
	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';

	return component;
}
1430
1431 char *get_pid_cgroup(pid_t pid, const char *contrl)
1432 {
1433 int cfd;
1434
1435 cfd = get_cgroup_fd(contrl);
1436 if (cfd < 0)
1437 return false;
1438
1439 if (pure_unified_layout(cgroup_ops))
1440 return cg_unified_get_current_cgroup(pid);
1441
1442 return cg_legacy_get_current_cgroup(pid, contrl);
1443 }
1444
1445 /*
1446 * check whether a fuse context may access a cgroup dir or file
1447 *
1448 * If file is not null, it is a cgroup file to check under cg.
1449 * If file is null, then we are checking perms on cg itself.
1450 *
1451 * For files we can check the mode of the list_keys result.
1452 * For cgroups, we must make assumptions based on the files under the
1453 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1454 * yet.
1455 */
1456 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1457 {
1458 struct cgfs_files *k = NULL;
1459 bool ret = false;
1460
1461 k = cgfs_get_key(contrl, cg, file);
1462 if (!k)
1463 return false;
1464
1465 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1466 if (perms_include(k->mode >> 6, mode)) {
1467 ret = true;
1468 goto out;
1469 }
1470 }
1471 if (fc->gid == k->gid) {
1472 if (perms_include(k->mode >> 3, mode)) {
1473 ret = true;
1474 goto out;
1475 }
1476 }
1477 ret = perms_include(k->mode, mode);
1478
1479 out:
1480 free_key(k);
1481 return ret;
1482 }
1483
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place
 * (systemd places init in its own scope).  A path that is exactly
 * "/init.scope" collapses to "/".
 */
void prune_init_slice(char *cg)
{
	size_t len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (len < suffix_len)
		return;

	suffix = cg + (len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0'; /* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1501
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool in_ancestor = false;
	char *task_cgroup, *cmp;

	task_cgroup = get_pid_cgroup(pid, contrl);
	if (!task_cgroup)
		return false;
	prune_init_slice(task_cgroup);

	/*
	 * Callers pass '/' or './' (openat()) for the root cgroup, and a
	 * path without a leading '/' otherwise; skip our leading '/' in
	 * the latter case so the prefixes line up.
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = task_cgroup;
	else
		cmp = task_cgroup + 1;

	if (strncmp(cmp, cg, strlen(cmp)) == 0)
		in_ancestor = true;
	else if (nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cgroup);
	return in_ancestor;
}
1544
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *own_cgroup, *task_cg;
	size_t query_len, task_len;

	/* Everyone may see the root. */
	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	own_cgroup = get_pid_cgroup(pid, contrl);
	if (!own_cgroup)
		return false;
	prune_init_slice(own_cgroup);

	task_cg = own_cgroup + 1; /* drop the leading '/' */
	query_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg; it can see everything.  The
		 * prefix checks below cannot handle this, since the '/'
		 * they test for is the one we chopped off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		/* Exactly our own cgroup. */
		visible = true;
	} else if (query_len < task_len) {
		/* Looking up a parent dir of our cgroup. */
		if (strncmp(task_cg, cg, query_len) == 0 && task_cg[query_len] == '/')
			visible = true;
	} else {
		/* Looking up a child dir of our cgroup. */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			visible = true;
	}

	free(own_cgroup);
	return visible;
}
1595
1596 /*
1597 * given /cgroup/freezer/a/b, return "freezer".
1598 * the returned char* should NOT be freed.
1599 */
1600 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1601 {
1602 const char *p1;
1603 char *contr, *slash;
1604
1605 if (strlen(path) < 9) {
1606 errno = EACCES;
1607 return NULL;
1608 }
1609 if (*(path + 7) != '/') {
1610 errno = EINVAL;
1611 return NULL;
1612 }
1613 p1 = path + 8;
1614 contr = strdupa(p1);
1615 if (!contr) {
1616 errno = ENOMEM;
1617 return NULL;
1618 }
1619 slash = strstr(contr, "/");
1620 if (slash)
1621 *slash = '\0';
1622
1623 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1624 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1625 return (*h)->__controllers;
1626 }
1627 errno = ENOENT;
1628 return NULL;
1629 }
1630
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	/* Need at least "/cgroup/" plus a controller name. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* Skip "/cgroup/" and locate the slash after the controller. */
	slash = strstr(path + 8, "/");
	if (!slash) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1651
1652 /*
1653 * split the last path element from the path in @cg.
1654 * @dir is newly allocated and should be freed, @last not
1655 */
1656 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1657 {
1658 char *p;
1659
1660 do {
1661 *dir = strdup(cg);
1662 } while (!*dir);
1663 *last = strrchr(cg, '/');
1664 if (!*last) {
1665 *last = NULL;
1666 return;
1667 }
1668 p = strrchr(*dir, '/');
1669 *p = '\0';
1670 }
1671
1672 /*
1673 * FUSE ops for /cgroup
1674 */
1675
/*
 * FUSE getattr for /cgroup paths.
 * Controllers and cgroup directories are synthesized as directories;
 * cgroup files take uid/gid/mode from the backend's key data.  All
 * timestamps are faked to "now".  Returns 0 or a negative errno.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Common defaults: root-owned, zero-size, timestamps "now". */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1/path2: parent dir + final component (dir or keyname). */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Resolve access against the caller's pidns init; fall back to the
	 * caller itself for host/shared pid namespaces. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: maybe a keyed file in the parent dir. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
1785
1786 int cg_opendir(const char *path, struct fuse_file_info *fi)
1787 {
1788 struct fuse_context *fc = fuse_get_context();
1789 const char *cgroup;
1790 struct file_info *dir_info;
1791 char *controller = NULL;
1792
1793 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1794 return -EIO;
1795
1796 if (strcmp(path, "/cgroup") == 0) {
1797 cgroup = NULL;
1798 controller = NULL;
1799 } else {
1800 // return list of keys for the controller, and list of child cgroups
1801 controller = pick_controller_from_path(fc, path);
1802 if (!controller)
1803 return -errno;
1804
1805 cgroup = find_cgroup_in_path(path);
1806 if (!cgroup) {
1807 /* this is just /cgroup/controller, return its contents */
1808 cgroup = "/";
1809 }
1810 }
1811
1812 pid_t initpid = lookup_initpid_in_store(fc->pid);
1813 if (initpid <= 1 || is_shared_pidns(initpid))
1814 initpid = fc->pid;
1815 if (cgroup) {
1816 if (!caller_may_see_dir(initpid, controller, cgroup))
1817 return -ENOENT;
1818 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1819 return -EACCES;
1820 }
1821
1822 /* we'll free this at cg_releasedir */
1823 dir_info = malloc(sizeof(*dir_info));
1824 if (!dir_info)
1825 return -ENOMEM;
1826 dir_info->controller = must_copy_string(controller);
1827 dir_info->cgroup = must_copy_string(cgroup);
1828 dir_info->type = LXC_TYPE_CGDIR;
1829 dir_info->buf = NULL;
1830 dir_info->file = NULL;
1831 dir_info->buflen = 0;
1832
1833 fi->fh = (unsigned long)dir_info;
1834 return 0;
1835 }
1836
/*
 * FUSE readdir for /cgroup paths.  For the top level we list the legacy
 * controllers; for a cgroup dir we list its keys and child cgroups.  A
 * caller outside the cgroup's subtree only sees the next path component
 * leading toward its own cgroup.  Returns 0 or a negative errno.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	/* fi->fh must have been set up by cg_opendir(). */
	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		/*
		 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
		 * This only works with the legacy hierarchy.
		 */
		for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
			if (is_unified_hierarchy(*h))
				continue;

			if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
				return -EIO;
		}

		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller is outside this subtree: show only the next dir
		 * component on the way to its own cgroup, if any. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Emit the cgroup's key files. */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	/* clist and list are owned by us once the backend returns them. */
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
1927
1928 void do_release_file_info(struct fuse_file_info *fi)
1929 {
1930 struct file_info *f = (struct file_info *)fi->fh;
1931
1932 if (!f)
1933 return;
1934
1935 fi->fh = 0;
1936
1937 free_disarm(f->controller);
1938 free_disarm(f->cgroup);
1939 free_disarm(f->file);
1940 free_disarm(f->buf);
1941 free_disarm(f);
1942 }
1943
/* FUSE releasedir: free the per-directory state allocated in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
1949
1950 int cg_open(const char *path, struct fuse_file_info *fi)
1951 {
1952 const char *cgroup;
1953 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1954 struct cgfs_files *k = NULL;
1955 struct file_info *file_info;
1956 struct fuse_context *fc = fuse_get_context();
1957 int ret;
1958
1959 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1960 return -EIO;
1961
1962 controller = pick_controller_from_path(fc, path);
1963 if (!controller)
1964 return -errno;
1965 cgroup = find_cgroup_in_path(path);
1966 if (!cgroup)
1967 return -errno;
1968
1969 get_cgdir_and_path(cgroup, &cgdir, &last);
1970 if (!last) {
1971 path1 = "/";
1972 path2 = cgdir;
1973 } else {
1974 path1 = cgdir;
1975 path2 = last;
1976 }
1977
1978 k = cgfs_get_key(controller, path1, path2);
1979 if (!k) {
1980 ret = -EINVAL;
1981 goto out;
1982 }
1983 free_key(k);
1984
1985 pid_t initpid = lookup_initpid_in_store(fc->pid);
1986 if (initpid <= 1 || is_shared_pidns(initpid))
1987 initpid = fc->pid;
1988 if (!caller_may_see_dir(initpid, controller, path1)) {
1989 ret = -ENOENT;
1990 goto out;
1991 }
1992 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1993 ret = -EACCES;
1994 goto out;
1995 }
1996
1997 /* we'll free this at cg_release */
1998 file_info = malloc(sizeof(*file_info));
1999 if (!file_info) {
2000 ret = -ENOMEM;
2001 goto out;
2002 }
2003 file_info->controller = must_copy_string(controller);
2004 file_info->cgroup = must_copy_string(path1);
2005 file_info->file = must_copy_string(path2);
2006 file_info->type = LXC_TYPE_CGFILE;
2007 file_info->buf = NULL;
2008 file_info->buflen = 0;
2009
2010 fi->fh = (unsigned long)file_info;
2011 ret = 0;
2012
2013 out:
2014 free(cgdir);
2015 return ret;
2016 }
2017
/*
 * FUSE access for /cgroup paths.  Mirrors the visibility and permission
 * checks done in cg_open(): read/execute on synthesized directories is
 * always allowed, writes and real keys go through fc_may_access().
 * Returns 0 or a negative errno.
 */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* path1/path2: parent dir + final component (dir or keyname). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* Not a key file: treat like a directory (rx ok, w not). */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2081
/* FUSE release: free the per-file state allocated in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2087
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait up to @timeout seconds for @sock to become readable (or hang
 * up).  Returns true when the socket is ready, false on timeout or
 * error.  On timeout errno is cleared; otherwise epoll_wait()'s errno
 * is preserved across the close() below.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	/* Recompute the remaining time budget after each EINTR. */
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may clobber errno; keep epoll_wait()'s value. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2135
/*
 * recv() from @sockfd with a two-second readability timeout.
 * Returns the recv() result, or -1 when the socket never became ready.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (wait_for_sock(sockfd, 2))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2142
2143 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2144 {
2145 struct msghdr msg = { 0 };
2146 struct iovec iov;
2147 struct cmsghdr *cmsg;
2148 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2149 char buf[1];
2150 buf[0] = 'p';
2151
2152 if (pingfirst) {
2153 if (msgrecv(sock, buf, 1) != 1) {
2154 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2155 return SEND_CREDS_FAIL;
2156 }
2157 }
2158
2159 msg.msg_control = cmsgbuf;
2160 msg.msg_controllen = sizeof(cmsgbuf);
2161
2162 cmsg = CMSG_FIRSTHDR(&msg);
2163 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2164 cmsg->cmsg_level = SOL_SOCKET;
2165 cmsg->cmsg_type = SCM_CREDENTIALS;
2166 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2167
2168 msg.msg_name = NULL;
2169 msg.msg_namelen = 0;
2170
2171 buf[0] = v;
2172 iov.iov_base = buf;
2173 iov.iov_len = sizeof(buf);
2174 msg.msg_iov = &iov;
2175 msg.msg_iovlen = 1;
2176
2177 if (sendmsg(sock, &msg, 0) < 0) {
2178 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2179 if (errno == 3)
2180 return SEND_CREDS_NOTSK;
2181 return SEND_CREDS_FAIL;
2182 }
2183
2184 return SEND_CREDS_OK;
2185 }
2186
/*
 * Counterpart of send_creds(): enable SO_PASSCRED on @sock, ping the
 * peer with one byte so it knows we are ready, then receive its
 * SCM_CREDENTIALS message.  The one-byte payload is stored in *v and
 * the (kernel-translated) credentials in *cred.
 * Returns false on setup or receive failure.  If no well-formed
 * credentials control message arrives, *cred keeps the -1 defaults
 * set below while true is still returned.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	/* SO_PASSCRED makes the kernel attach translated credentials. */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* Ping the sender: it blocks in msgrecv() until this arrives. */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only accept a well-formed credentials control message. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2244
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;   // pipe used to ACK the parent once the child is running
	int sock;     // socketpair end carrying the pid translation protocol
	pid_t tpid;   // pid whose namespace the parent joined before cloning
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2251
2252 /*
2253 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2254 * with clone(). This simply writes '1' as ACK back to the parent
2255 * before calling the actual wrapped function.
2256 */
2257 static int pid_ns_clone_wrapper(void *arg) {
2258 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2259 char b = '1';
2260
2261 close(args->cpipe[0]);
2262 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2263 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2264 close(args->cpipe[1]);
2265 return args->wrapped(args->sock, args->tpid);
2266 }
2267
2268 /*
2269 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2270 * int value back over the socket. This shifts the pid from the
2271 * sender's pidns into tpid's pidns.
2272 */
2273 static int pid_to_ns(int sock, pid_t tpid)
2274 {
2275 char v = '0';
2276 struct ucred cred;
2277
2278 while (recv_creds(sock, &cred, &v)) {
2279 if (v == '1')
2280 return 0;
2281 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2282 return 1;
2283 }
2284 return 0;
2285 }
2286
2287
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* One page of stack for the clone child; stack grows downward. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/*
	 * NOTE(review): wait_for_pid() returns 0 on success, so this exits
	 * with status 1 on success and 0 on failure.  do_read_pids() ignores
	 * the exit status, but confirm intent before relying on it.
	 */
	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2344
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * On success *d holds the newline-separated, translated pid list
 * (allocated via must_strcat_pid()); returns false on any failure.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Fetch the untranslated pid list from the cgroup backend. */
	if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send the pid as a credential so the kernel translates it. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* task is gone; skip this pid */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child (payload '1') that we are done. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2432
/*
 * FUSE read for a cgroup file opened with cg_open().  The value is
 * produced in one shot: any non-zero @offset returns 0 (EOF) rather
 * than continuing a partial read.  tasks/cgroup.procs are special-cased
 * so the pids are translated into the reader's pid namespace.
 * Returns the number of bytes copied or a negative errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!f->controller)
		return -EINVAL;

	/* Re-check that the key still exists. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	/* Copy out at most @size bytes, appending a newline if it fits. */
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2499
2500 static int pid_from_ns(int sock, pid_t tpid)
2501 {
2502 pid_t vpid;
2503 struct ucred cred;
2504 char v;
2505 int ret;
2506
2507 cred.uid = 0;
2508 cred.gid = 0;
2509 while (1) {
2510 if (!wait_for_sock(sock, 2)) {
2511 lxcfs_error("%s\n", "Timeout reading from parent.");
2512 return 1;
2513 }
2514 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2515 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2516 return 1;
2517 }
2518 if (vpid == -1) // done
2519 break;
2520 v = '0';
2521 cred.pid = vpid;
2522 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2523 v = '1';
2524 cred.pid = getpid();
2525 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2526 return 1;
2527 }
2528 }
2529 return 0;
2530 }
2531
/*
 * Forked-helper entry point: attach to @tpid's pid namespace, then
 * clone() a child that runs pid_from_ns() there so pids received over
 * @sock are interpreted in the target namespace.
 *
 * Never returns: every path terminates via _exit(), success only when
 * the clone child acked over the pipe and exited cleanly.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	/* Join the target's pid namespace; affects children we create. */
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe used by the clone child to ack that it is set up. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page carved off this frame serves as the clone child's stack;
	 * stack + stack_size is passed because the stack grows downward. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2577
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Returns true with the mapped id stored in *@answer on success,
 * false when the uid_map cannot be opened or no mapping exists.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];
	int ret;

	/* Bounded formatting instead of sprintf(); bail on truncation. */
	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return false;

	f = fopen(line, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* (uid_t)-1 is the "no mapping" sentinel from convert_id_to_ns(). */
	if (*answer == -1)
		return false;
	return true;
}
2599
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left at -1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;
	/* Bounded formatting instead of sprintf(); bail on truncation. */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* "Uid:"/"Gid:" lines carry four ids; the first is the real one. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2638
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 * . they are the same task,
 * . they are owned by the same uid,
 * . @r is root on the host, or
 * . @v's uid is mapped into @r's namespace where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (r_uid == victim_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(victim_uid, r, &mapped);
}
2664
/*
 * Write a caller-supplied pid list into the cgroup's pids file,
 * translating each pid from the writer's pid namespace into ours.
 *
 * A child is forked into @tpid's pid namespace; for every pid parsed
 * from @buf we send it over a socketpair and receive it back via SCM
 * credentials, so the kernel performs the namespace translation. Each
 * translated pid is authorized with may_move_pid() before being
 * written to the pids file.
 *
 * Returns true only if every pid was translated, authorized, written
 * and flushed successfully.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid); /* never returns; _exit()s */
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* '0' means the child translated the pid successfully. */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				/* NOTE(review): pids are printed with no separator;
				 * presumably only one pid per write ever reaches this
				 * path - verify against the cg_write callers. */
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		/* Advance to the pid on the next line, if any. */
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose() flushes the buffered pids into the kernel file;
		 * a failed flush means the move was rejected. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2743
2744 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2745 struct fuse_file_info *fi)
2746 {
2747 struct fuse_context *fc = fuse_get_context();
2748 char *localbuf = NULL;
2749 struct cgfs_files *k = NULL;
2750 struct file_info *f = (struct file_info *)fi->fh;
2751 bool r;
2752
2753 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2754 return -EIO;
2755
2756 if (f->type != LXC_TYPE_CGFILE) {
2757 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2758 return -EIO;
2759 }
2760
2761 if (offset)
2762 return 0;
2763
2764 localbuf = alloca(size+1);
2765 localbuf[size] = '\0';
2766 memcpy(localbuf, buf, size);
2767
2768 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2769 size = -EINVAL;
2770 goto out;
2771 }
2772
2773 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2774 size = -EACCES;
2775 goto out;
2776 }
2777
2778 if (strcmp(f->file, "tasks") == 0 ||
2779 strcmp(f->file, "/tasks") == 0 ||
2780 strcmp(f->file, "/cgroup.procs") == 0 ||
2781 strcmp(f->file, "cgroup.procs") == 0)
2782 // special case - we have to translate the pids
2783 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2784 else
2785 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2786
2787 if (!r)
2788 size = -EINVAL;
2789
2790 out:
2791 free_key(k);
2792 return size;
2793 }
2794
/*
 * FUSE chown handler for cgroup files and directories.
 *
 * The caller must be privileged (root in a namespace mapping the
 * file's owner) for the change to be allowed. Returns 0 or the
 * result of cgfs_chown_file() on success, negative errno otherwise.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	/* The top-level /cgroup directory itself can never be chowned. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	/* Split into parent dir (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
2860
2861 int cg_chmod(const char *path, mode_t mode)
2862 {
2863 struct fuse_context *fc = fuse_get_context();
2864 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2865 struct cgfs_files *k = NULL;
2866 const char *cgroup;
2867 int ret;
2868
2869 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2870 return -EIO;
2871
2872 if (strcmp(path, "/cgroup") == 0)
2873 return -EPERM;
2874
2875 controller = pick_controller_from_path(fc, path);
2876 if (!controller)
2877 return errno == ENOENT ? -EPERM : -errno;
2878
2879 cgroup = find_cgroup_in_path(path);
2880 if (!cgroup)
2881 /* this is just /cgroup/controller */
2882 return -EPERM;
2883
2884 get_cgdir_and_path(cgroup, &cgdir, &last);
2885
2886 if (!last) {
2887 path1 = "/";
2888 path2 = cgdir;
2889 } else {
2890 path1 = cgdir;
2891 path2 = last;
2892 }
2893
2894 if (is_child_cgroup(controller, path1, path2)) {
2895 // get uid, gid, from '/tasks' file and make up a mode
2896 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2897 k = cgfs_get_key(controller, cgroup, "tasks");
2898
2899 } else
2900 k = cgfs_get_key(controller, path1, path2);
2901
2902 if (!k) {
2903 ret = -EINVAL;
2904 goto out;
2905 }
2906
2907 /*
2908 * This being a fuse request, the uid and gid must be valid
2909 * in the caller's namespace. So we can just check to make
2910 * sure that the caller is root in his uid, and privileged
2911 * over the file's current owner.
2912 */
2913 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2914 ret = -EPERM;
2915 goto out;
2916 }
2917
2918 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2919 ret = -EINVAL;
2920 goto out;
2921 }
2922
2923 ret = 0;
2924 out:
2925 free_key(k);
2926 free(cgdir);
2927 return ret;
2928 }
2929
/*
 * FUSE mkdir handler: create a child cgroup.
 *
 * The caller may only create cgroups below its own cgroup and must
 * have read/write access to the parent. Returns the cgfs_create()
 * result on success, negative errno otherwise.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory the new cgroup lives in. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
2983
/*
 * FUSE rmdir handler: remove a child cgroup.
 *
 * Only cgroups below the caller's own cgroup may be removed, and the
 * caller needs write access to the parent directory. Returns 0 on
 * success, negative errno otherwise.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup is EBUSY, anything else
		 * outside its ancestry simply does not exist for it. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3044
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3051
/* Note that "memory.stat" in cgroup2 is hierarchical by default. */
/*
 * Parse a memory.stat blob into kB-scaled output values.
 *
 * @version selects the key scheme: the unified hierarchy uses bare
 * keys ("cache"), the legacy hierarchy the hierarchical "total_*"
 * keys. Values in @memstat are in bytes and are reported in kB.
 * Keys that do not appear leave their output untouched.
 *
 * Fix: values were previously scanned at hard-coded offsets sized for
 * the legacy "total_*" key names, which pointed into the middle of the
 * number for the shorter unified keys; scan right after the matched
 * key instead.
 */
static void parse_memstat(int version,
			  char *memstat,
			  unsigned long *cached,
			  unsigned long *active_anon,
			  unsigned long *inactive_anon,
			  unsigned long *active_file,
			  unsigned long *inactive_file,
			  unsigned long *unevictable,
			  unsigned long *shmem)
{
	bool unified = is_unified_controller(version);
	struct {
		const char *key;
		unsigned long *val;
	} fields[] = {
		{ unified ? "cache"         : "total_cache",         cached        },
		{ unified ? "active_anon"   : "total_active_anon",   active_anon   },
		{ unified ? "inactive_anon" : "total_inactive_anon", inactive_anon },
		{ unified ? "active_file"   : "total_active_file",   active_file   },
		{ unified ? "inactive_file" : "total_inactive_file", inactive_file },
		{ unified ? "unevictable"   : "total_unevictable",   unevictable   },
		{ unified ? "shmem"         : "total_shmem",         shmem         },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			size_t len = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, len) != 0)
				continue;

			/* Value follows the key; stored in bytes, reported in kB. */
			sscanf(memstat + len, "%lu", fields[i].val);
			*fields[i].val /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3108
/*
 * Scan a blkio stat blob @str for the line "major:minor iotype" and
 * store the trailing number in *@v; *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *nl;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*str) {
		if (strncmp(str, key, keylen) == 0) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		nl = strchr(str, '\n');
		if (!nl)
			return;
		str = nl + 1;
	}
}
3131
/*
 * Read @path into @d's cache buffer and serve the first @size bytes
 * from offset 0. The full length is recorded in d->size and any
 * remainder in d->cached so follow-up reads at offset > 0 can be
 * served from the cache. Returns the number of bytes copied into
 * @buf, or 0 on any error.
 */
int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t linelen = 0, total_len = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	f = fopen(path, "r");
	if (!f)
		return 0;

	/* Copy the file line by line into the cache buffer. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	/* Remember how much remains for reads at offset > 0. */
	if (d->size > total_len)
		d->cached = d->size - total_len;
	return total_len;
}
3170
3171 /*
3172 * FUSE ops for /proc
3173 */
3174
3175 static unsigned long get_memlimit(const char *cgroup, bool swap)
3176 {
3177 int ret;
3178 __do_free char *memlimit_str = NULL;
3179 unsigned long memlimit = -1;
3180
3181 if (swap)
3182 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3183 else
3184 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3185 if (ret > 0)
3186 memlimit = strtoul(memlimit_str, NULL, 10);
3187
3188 return memlimit;
3189 }
3190
3191 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3192 {
3193 __do_free char *copy = NULL;
3194 unsigned long memlimit = 0;
3195 unsigned long retlimit;
3196
3197 copy = strdup(cgroup);
3198 retlimit = get_memlimit(copy, swap);
3199
3200 while (strcmp(copy, "/") != 0) {
3201 char *it = copy;
3202
3203 it = dirname(it);
3204 memlimit = get_memlimit(it, swap);
3205 if (memlimit != -1 && memlimit < retlimit)
3206 retlimit = memlimit;
3207 };
3208
3209 return retlimit;
3210 }
3211
/*
 * Produce a container-scoped /proc/meminfo.
 *
 * The host's /proc/meminfo is read line by line; lines that depend on
 * the container's memory cgroup (MemTotal, MemFree, Swap*, Cached,
 * Active/Inactive, Shmem, ...) are replaced with values derived from
 * the cgroup's limits and usage, host-only lines (Slab, Buffers, ...)
 * are zeroed, and everything else is passed through. The rendered
 * text is cached in d->buf for reads at offset > 0.
 *
 * Falls back to the raw host file when no memory cgroup is found;
 * returns 0 on any other error.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cgroup = NULL, *line = NULL,
		       *memusage_str = NULL, *memstat_str = NULL,
		       *memswlimit_str = NULL, *memswusage_str = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
		      memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
		      inactive_anon = 0, active_file = 0, inactive_file = 0,
		      unevictable = 0, shmem = 0, hostswtotal = 0;
	size_t linelen = 0, total_len = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	int ret;

	/* Reads at offset > 0 are served from the cache built earlier. */
	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cgroup = get_pid_cgroup(initpid, "memory");
	if (!cgroup)
		return read_file_fuse("/proc/meminfo", buf, size, d);

	prune_init_slice(cgroup);

	memlimit = get_min_memlimit(cgroup, false);

	ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
	if (ret < 0)
		return 0;

	ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
	if (ret < 0)
		return 0;
	parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
		      &active_file, &inactive_file, &unevictable, &shmem);

	/*
	 * Following values are allowed to fail, because swapaccount might be
	 * turned off for current kernel.
	 */
	ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
	if (ret >= 0)
		ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
	if (ret >= 0) {
		memswlimit = get_min_memlimit(cgroup, true);
		memswusage = strtoul(memswusage_str, NULL, 10);
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* All values below are reported in kB. */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	f = fopen("/proc/meminfo", "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more memory than the host has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
			   opts && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
			   memswusage > 0 && opts && opts->swap_off == false) {
			/* memsw counts mem+swap, so swap usage is the excess
			 * of memsw usage over plain memory usage. */
			unsigned long swaptotal = memswlimit,
				      swapusage = memusage > memswusage
						      ? 0
						      : memswusage - memusage,
				      swapfree = swapusage < swaptotal
						     ? swaptotal - swapusage
						     : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				 active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				 inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
3408
3409 /*
3410 * Read the cpuset.cpus for cg
3411 * Return the answer in a newly allocated string which must be freed
3412 */
3413 char *get_cpuset(const char *cg)
3414 {
3415 char *value = NULL;
3416 int ret;
3417
3418 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3419 if (ret < 0)
3420 return NULL;
3421
3422 return value;
3423 }
3424
3425 bool cpu_in_cpuset(int cpu, const char *cpuset);
3426
/*
 * Return true if @line is a "processor : N" line whose cpu number N
 * is contained in @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3435
3436 /*
3437 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3438 * depending on `param`. Parameter value is returned throuh `value`.
3439 */
3440 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3441 {
3442 __do_free char *str = NULL;
3443 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
3444
3445 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
3446
3447 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3448 return false;
3449
3450 if (sscanf(str, "%ld", value) != 1)
3451 return false;
3452
3453 return true;
3454 }
3455
3456 /*
3457 * Return the maximum number of visible CPUs based on CPU quotas.
3458 * If there is no quota set, zero is returned.
3459 */
3460 int max_cpu_count(const char *cg)
3461 {
3462 int rv, nprocs;
3463 int64_t cfs_quota, cfs_period;
3464 int nr_cpus_in_cpuset = 0;
3465 char *cpuset = NULL;
3466
3467 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3468 return 0;
3469
3470 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3471 return 0;
3472
3473 cpuset = get_cpuset(cg);
3474 if (cpuset)
3475 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3476
3477 if (cfs_quota <= 0 || cfs_period <= 0){
3478 if (nr_cpus_in_cpuset > 0)
3479 return nr_cpus_in_cpuset;
3480
3481 return 0;
3482 }
3483
3484 rv = cfs_quota / cfs_period;
3485
3486 /* In case quota/period does not yield a whole number, add one CPU for
3487 * the remainder.
3488 */
3489 if ((cfs_quota % cfs_period) > 0)
3490 rv += 1;
3491
3492 nprocs = get_nprocs();
3493
3494 if (rv > nprocs)
3495 rv = nprocs;
3496
3497 /* use min value in cpu quota and cpuset */
3498 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3499 rv = nr_cpus_in_cpuset;
3500
3501 return rv;
3502 }
3503
/*
 * Return the exact (possibly fractional) number of visible CPUs based
 * on CPU quotas. If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	int nprocs;
	double count;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3532
/*
 * check whether this is a '^processor' line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3544
3545 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3546 struct fuse_file_info *fi)
3547 {
3548 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3549 __do_fclose FILE *f = NULL;
3550 struct fuse_context *fc = fuse_get_context();
3551 struct file_info *d = (struct file_info *)fi->fh;
3552 size_t linelen = 0, total_len = 0;
3553 bool am_printing = false, firstline = true, is_s390x = false;
3554 int curcpu = -1, cpu, max_cpus = 0;
3555 bool use_view;
3556 char *cache = d->buf;
3557 size_t cache_size = d->buflen;
3558
3559 if (offset){
3560 int left;
3561
3562 if (offset > d->size)
3563 return -EINVAL;
3564
3565 if (!d->cached)
3566 return 0;
3567
3568 left = d->size - offset;
3569 total_len = left > size ? size: left;
3570 memcpy(buf, cache + offset, total_len);
3571
3572 return total_len;
3573 }
3574
3575 pid_t initpid = lookup_initpid_in_store(fc->pid);
3576 if (initpid <= 1 || is_shared_pidns(initpid))
3577 initpid = fc->pid;
3578 cg = get_pid_cgroup(initpid, "cpuset");
3579 if (!cg)
3580 return read_file_fuse("proc/cpuinfo", buf, size, d);
3581 prune_init_slice(cg);
3582
3583 cpuset = get_cpuset(cg);
3584 if (!cpuset)
3585 return 0;
3586
3587 use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
3588 if (use_view)
3589 max_cpus = max_cpu_count(cg);
3590
3591 f = fopen("/proc/cpuinfo", "r");
3592 if (!f)
3593 return 0;
3594
3595 while (getline(&line, &linelen, f) != -1) {
3596 ssize_t l;
3597 if (firstline) {
3598 firstline = false;
3599 if (strstr(line, "IBM/S390") != NULL) {
3600 is_s390x = true;
3601 am_printing = true;
3602 continue;
3603 }
3604 }
3605 if (strncmp(line, "# processors:", 12) == 0)
3606 continue;
3607 if (is_processor_line(line)) {
3608 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3609 break;
3610 am_printing = cpuline_in_cpuset(line, cpuset);
3611 if (am_printing) {
3612 curcpu ++;
3613 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3614 if (l < 0) {
3615 perror("Error writing to cache");
3616 return 0;
3617 }
3618 if (l >= cache_size) {
3619 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3620 return 0;
3621 }
3622 cache += l;
3623 cache_size -= l;
3624 total_len += l;
3625 }
3626 continue;
3627 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3628 char *p;
3629 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3630 break;
3631 if (!cpu_in_cpuset(cpu, cpuset))
3632 continue;
3633 curcpu ++;
3634 p = strchr(line, ':');
3635 if (!p || !*p)
3636 return 0;
3637 p++;
3638 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3639 if (l < 0) {
3640 perror("Error writing to cache");
3641 return 0;
3642 }
3643 if (l >= cache_size) {
3644 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3645 return 0;
3646 }
3647 cache += l;
3648 cache_size -= l;
3649 total_len += l;
3650 continue;
3651
3652 }
3653 if (am_printing) {
3654 l = snprintf(cache, cache_size, "%s", line);
3655 if (l < 0) {
3656 perror("Error writing to cache");
3657 return 0;
3658 }
3659 if (l >= cache_size) {
3660 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3661 return 0;
3662 }
3663 cache += l;
3664 cache_size -= l;
3665 total_len += l;
3666 }
3667 }
3668
3669 if (is_s390x) {
3670 __do_free char *origcache = d->buf;
3671 ssize_t l;
3672
3673 d->buf = malloc(d->buflen);
3674 if (!d->buf) {
3675 d->buf = move_ptr(origcache);
3676 return 0;
3677 }
3678
3679 cache = d->buf;
3680 cache_size = d->buflen;
3681 total_len = 0;
3682 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3683 if (l < 0 || l >= cache_size)
3684 return 0;
3685
3686 cache_size -= l;
3687 cache += l;
3688 total_len += l;
3689 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3690 if (l < 0 || l >= cache_size)
3691 return 0;
3692
3693 cache_size -= l;
3694 cache += l;
3695 total_len += l;
3696 l = snprintf(cache, cache_size, "%s", origcache);
3697 if (l < 0 || l >= cache_size)
3698 return 0;
3699 total_len += l;
3700 }
3701
3702 d->cached = 1;
3703 d->size = total_len;
3704 if (total_len > size ) total_len = size;
3705
3706 /* read from off 0 */
3707 memcpy(buf, d->buf, total_len);
3708 return total_len;
3709 }
3710
3711 static uint64_t get_reaper_start_time(pid_t pid)
3712 {
3713 int ret;
3714 FILE *f;
3715 uint64_t starttime;
3716 /* strlen("/proc/") = 6
3717 * +
3718 * LXCFS_NUMSTRLEN64
3719 * +
3720 * strlen("/stat") = 5
3721 * +
3722 * \0 = 1
3723 * */
3724 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3725 char path[__PROC_PID_STAT_LEN];
3726 pid_t qpid;
3727
3728 qpid = lookup_initpid_in_store(pid);
3729 if (qpid <= 0) {
3730 /* Caller can check for EINVAL on 0. */
3731 errno = EINVAL;
3732 return 0;
3733 }
3734
3735 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3736 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3737 /* Caller can check for EINVAL on 0. */
3738 errno = EINVAL;
3739 return 0;
3740 }
3741
3742 f = fopen(path, "r");
3743 if (!f) {
3744 /* Caller can check for EINVAL on 0. */
3745 errno = EINVAL;
3746 return 0;
3747 }
3748
3749 /* Note that the *scanf() argument supression requires that length
3750 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3751 * at us. It's like telling someone you're not married and then asking
3752 * if you can bring your wife to the party.
3753 */
3754 ret = fscanf(f, "%*d " /* (1) pid %d */
3755 "%*s " /* (2) comm %s */
3756 "%*c " /* (3) state %c */
3757 "%*d " /* (4) ppid %d */
3758 "%*d " /* (5) pgrp %d */
3759 "%*d " /* (6) session %d */
3760 "%*d " /* (7) tty_nr %d */
3761 "%*d " /* (8) tpgid %d */
3762 "%*u " /* (9) flags %u */
3763 "%*u " /* (10) minflt %lu */
3764 "%*u " /* (11) cminflt %lu */
3765 "%*u " /* (12) majflt %lu */
3766 "%*u " /* (13) cmajflt %lu */
3767 "%*u " /* (14) utime %lu */
3768 "%*u " /* (15) stime %lu */
3769 "%*d " /* (16) cutime %ld */
3770 "%*d " /* (17) cstime %ld */
3771 "%*d " /* (18) priority %ld */
3772 "%*d " /* (19) nice %ld */
3773 "%*d " /* (20) num_threads %ld */
3774 "%*d " /* (21) itrealvalue %ld */
3775 "%" PRIu64, /* (22) starttime %llu */
3776 &starttime);
3777 if (ret != 1) {
3778 fclose(f);
3779 /* Caller can check for EINVAL on 0. */
3780 errno = EINVAL;
3781 return 0;
3782 }
3783
3784 fclose(f);
3785
3786 errno = 0;
3787 return starttime;
3788 }
3789
/* Return the reaper's start time converted from clock ticks to seconds
 * since boot. Returns 0 on failure.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t ticks;
	int64_t tck;

	/* get_reaper_start_time() signals failure as 0 with errno == EINVAL. */
	ticks = get_reaper_start_time(pid);
	if (ticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	tck = sysconf(_SC_CLK_TCK);
	if (tck < 0 && errno == EINVAL) {
		lxcfs_debug(
			"%s\n",
			"failed to determine number of clock ticks in a second");
		return 0;
	}

	return (double)ticks / (uint64_t)tck;
}
3814
3815 static double get_reaper_age(pid_t pid)
3816 {
3817 uint64_t uptime_ms;
3818 double procstart, procage;
3819
3820 /* We need to substract the time the process has started since system
3821 * boot minus the time when the system has started to get the actual
3822 * reaper age.
3823 */
3824 procstart = get_reaper_start_time_in_sec(pid);
3825 procage = procstart;
3826 if (procstart > 0) {
3827 int ret;
3828 struct timespec spec;
3829
3830 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3831 if (ret < 0)
3832 return 0;
3833
3834 /* We could make this more precise here by using the tv_nsec
3835 * field in the timespec struct and convert it to milliseconds
3836 * and then create a double for the seconds and milliseconds but
3837 * that seems more work than it is worth.
3838 */
3839 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3840 procage = (uptime_ms - (procstart * 1000)) / 1000;
3841 }
3842
3843 return procage;
3844 }
3845
3846 /*
3847 * Returns 0 on success.
3848 * It is the caller's responsibility to free `return_usage`, unless this
3849 * function returns an error.
3850 */
3851 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
3852 {
3853 __do_free char *usage_str = NULL;
3854 __do_free struct cpuacct_usage *cpu_usage = NULL;
3855 int cpucount = get_nprocs_conf();
3856 int read_pos = 0, read_cnt=0;
3857 int i, j, ret;
3858 int cg_cpu;
3859 uint64_t cg_user, cg_system;
3860 int64_t ticks_per_sec;
3861
3862 ticks_per_sec = sysconf(_SC_CLK_TCK);
3863
3864 if (ticks_per_sec < 0 && errno == EINVAL) {
3865 lxcfs_v(
3866 "%s\n",
3867 "read_cpuacct_usage_all failed to determine number of clock ticks "
3868 "in a second");
3869 return -1;
3870 }
3871
3872 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3873 if (!cpu_usage)
3874 return -ENOMEM;
3875
3876 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
3877 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3878 char *data = NULL;
3879 int i = 0, read_pos = 0, read_cnt=0;
3880 size_t sz = 0, asz = 0;
3881
3882 /* read cpuacct.usage_percpu instead. */
3883 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
3884 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
3885 return -1;
3886 lxcfs_v("usage_str: %s\n", usage_str);
3887
3888 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
3889 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
3890
3891 must_strcat(&data, &sz, &asz, "cpu user system\n");
3892
3893 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
3894 lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
3895 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
3896 i++;
3897 read_pos += read_cnt;
3898 }
3899
3900 usage_str = data;
3901
3902 lxcfs_v("usage_str: %s\n", usage_str);
3903 }
3904
3905 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3906 lxcfs_error("read_cpuacct_usage_all reading first line from "
3907 "%s/cpuacct.usage_all failed.\n", cg);
3908 return -1;
3909 }
3910
3911 read_pos += read_cnt;
3912
3913 for (i = 0, j = 0; i < cpucount; i++) {
3914 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3915 &cg_system, &read_cnt);
3916
3917 if (ret == EOF)
3918 break;
3919
3920 if (ret != 3) {
3921 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3922 "failed.\n", cg);
3923 return -1;
3924 }
3925
3926 read_pos += read_cnt;
3927
3928 /* Convert the time from nanoseconds to USER_HZ */
3929 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3930 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3931 j++;
3932 }
3933
3934 *return_usage = move_ptr(cpu_usage);
3935 *size = cpucount;
3936 return 0;
3937 }
3938
3939 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
3940 {
3941 int i;
3942 unsigned long sum = 0;
3943
3944 for (i = 0; i < cpu_count; i++) {
3945 if (!newer[i].online)
3946 continue;
3947
3948 /* When cpuset is changed on the fly, the CPUs might get reordered.
3949 * We could either reset all counters, or check that the substractions
3950 * below will return expected results.
3951 */
3952 if (newer[i].user > older[i].user)
3953 diff[i].user = newer[i].user - older[i].user;
3954 else
3955 diff[i].user = 0;
3956
3957 if (newer[i].system > older[i].system)
3958 diff[i].system = newer[i].system - older[i].system;
3959 else
3960 diff[i].system = 0;
3961
3962 if (newer[i].idle > older[i].idle)
3963 diff[i].idle = newer[i].idle - older[i].idle;
3964 else
3965 diff[i].idle = 0;
3966
3967 sum += diff[i].user;
3968 sum += diff[i].system;
3969 sum += diff[i].idle;
3970 }
3971
3972 return sum;
3973 }
3974
3975 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
3976 {
3977 unsigned long free_space, to_add;
3978
3979 free_space = threshold - usage->user - usage->system;
3980
3981 if (free_space > usage->idle)
3982 free_space = usage->idle;
3983
3984 to_add = free_space > *surplus ? *surplus : free_space;
3985
3986 *counter += to_add;
3987 usage->idle -= to_add;
3988 *surplus -= to_add;
3989 }
3990
/* Walk one hash-bucket list of per-cgroup stat nodes and free every node
 * whose cgroup has disappeared (detected by the absence of its cpu.shares
 * file). Returns the new list head, or NULL if every node was removed.
 * The caller must hold the bucket's write lock.
 */
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL, *prev, *tmp;

	for (prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			/* Cgroup is gone: unlink this node and free it. */
			tmp = node;
			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			/* Keep this node; the first kept node is the new head. */
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}
4017
#define PROC_STAT_PRUNE_INTERVAL 10
/* Prune stale per-cgroup stat nodes from the hash buckets, at most once
 * every PROC_STAT_PRUNE_INTERVAL seconds per bucket.
 */
static void prune_proc_stat_history(void)
{
	int i;
	time_t now = time(NULL);

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		/* NOTE(review): this returns — not continues — on the first
		 * recently-checked bucket, skipping all later buckets too.
		 * Presumably a cheap global rate limit; confirm intended. */
		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}
4040
4041 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4042 {
4043 struct cg_proc_stat *node;
4044
4045 pthread_rwlock_rdlock(&head->lock);
4046
4047 if (!head->next) {
4048 pthread_rwlock_unlock(&head->lock);
4049 return NULL;
4050 }
4051
4052 node = head->next;
4053
4054 do {
4055 if (strcmp(cg, node->cg) == 0)
4056 goto out;
4057 } while ((node = node->next));
4058
4059 node = NULL;
4060
4061 out:
4062 pthread_rwlock_unlock(&head->lock);
4063 prune_proc_stat_history();
4064 return node;
4065 }
4066
4067 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4068 {
4069 struct cg_proc_stat *node;
4070 int i;
4071
4072 node = malloc(sizeof(struct cg_proc_stat));
4073 if (!node)
4074 goto err;
4075
4076 node->cg = NULL;
4077 node->usage = NULL;
4078 node->view = NULL;
4079
4080 node->cg = malloc(strlen(cg) + 1);
4081 if (!node->cg)
4082 goto err;
4083
4084 strcpy(node->cg, cg);
4085
4086 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4087 if (!node->usage)
4088 goto err;
4089
4090 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4091
4092 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4093 if (!node->view)
4094 goto err;
4095
4096 node->cpu_count = cpu_count;
4097 node->next = NULL;
4098
4099 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4100 lxcfs_error("%s\n", "Failed to initialize node lock");
4101 goto err;
4102 }
4103
4104 for (i = 0; i < cpu_count; i++) {
4105 node->view[i].user = 0;
4106 node->view[i].system = 0;
4107 node->view[i].idle = 0;
4108 }
4109
4110 return node;
4111
4112 err:
4113 if (node && node->cg)
4114 free(node->cg);
4115 if (node && node->usage)
4116 free(node->usage);
4117 if (node && node->view)
4118 free(node->view);
4119 if (node)
4120 free(node);
4121
4122 return NULL;
4123 }
4124
4125 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4126 {
4127 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4128 struct cg_proc_stat_head *head = proc_stat_history[hash];
4129 struct cg_proc_stat *node, *rv = new_node;
4130
4131 pthread_rwlock_wrlock(&head->lock);
4132
4133 if (!head->next) {
4134 head->next = new_node;
4135 goto out;
4136 }
4137
4138 node = head->next;
4139
4140 for (;;) {
4141 if (strcmp(node->cg, new_node->cg) == 0) {
4142 /* The node is already present, return it */
4143 free_proc_stat_node(new_node);
4144 rv = node;
4145 goto out;
4146 }
4147
4148 if (node->next) {
4149 node = node->next;
4150 continue;
4151 }
4152
4153 node->next = new_node;
4154 goto out;
4155 }
4156
4157 out:
4158 pthread_rwlock_unlock(&head->lock);
4159 return rv;
4160 }
4161
4162 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4163 {
4164 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
4165
4166 /* Allocate new memory */
4167 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4168 if (!new_usage)
4169 return false;
4170
4171 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4172 if (!new_view)
4173 return false;
4174
4175 /* Copy existing data & initialize new elements */
4176 for (int i = 0; i < cpu_count; i++) {
4177 if (i < node->cpu_count) {
4178 new_usage[i].user = node->usage[i].user;
4179 new_usage[i].system = node->usage[i].system;
4180 new_usage[i].idle = node->usage[i].idle;
4181
4182 new_view[i].user = node->view[i].user;
4183 new_view[i].system = node->view[i].system;
4184 new_view[i].idle = node->view[i].idle;
4185 } else {
4186 new_usage[i].user = 0;
4187 new_usage[i].system = 0;
4188 new_usage[i].idle = 0;
4189
4190 new_view[i].user = 0;
4191 new_view[i].system = 0;
4192 new_view[i].idle = 0;
4193 }
4194 }
4195
4196 free(node->usage);
4197 node->usage = move_ptr(new_usage);
4198
4199 free(node->view);
4200 node->view = move_ptr(new_view);
4201 node->cpu_count = cpu_count;
4202
4203 return true;
4204 }
4205
/* Look up the stat node for @cg, creating and registering it from the
 * current @usage snapshot if it does not exist yet. If the host gained
 * cpus since the node was created, the node's arrays are grown.
 *
 * IMPORTANT: on success the node is returned with node->lock held — the
 * caller is responsible for unlocking it. Returns NULL on failure.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node() may hand back an existing node if
		 * another thread registered one for this cgroup meanwhile. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
					node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4241
4242 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4243 {
4244 int i;
4245
4246 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4247 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4248
4249 for (i = 0; i < cpu_count; i++) {
4250 node->view[i].user = 0;
4251 node->view[i].system = 0;
4252 node->view[i].idle = 0;
4253 }
4254
4255 node->cpu_count = cpu_count;
4256 }
4257
4258 static int cpuview_proc_stat(const char *cg, const char *cpuset,
4259 struct cpuacct_usage *cg_cpu_usage,
4260 int cg_cpu_usage_size, FILE *f, char *buf,
4261 size_t buf_size)
4262 {
4263 __do_free char *line = NULL;
4264 __do_free struct cpuacct_usage *diff = NULL;
4265 size_t linelen = 0, total_len = 0, l;
4266 int curcpu = -1; /* cpu numbering starts at 0 */
4267 int physcpu, i;
4268 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4269 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4270 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4271 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4272 unsigned long user_surplus = 0, system_surplus = 0;
4273 unsigned long total_sum, threshold;
4274 struct cg_proc_stat *stat_node;
4275 int nprocs = get_nprocs_conf();
4276
4277 if (cg_cpu_usage_size < nprocs)
4278 nprocs = cg_cpu_usage_size;
4279
4280 /* Read all CPU stats and stop when we've encountered other lines */
4281 while (getline(&line, &linelen, f) != -1) {
4282 int ret;
4283 char cpu_char[10]; /* That's a lot of cores */
4284 uint64_t all_used, cg_used;
4285
4286 if (strlen(line) == 0)
4287 continue;
4288
4289 /* not a ^cpuN line containing a number N */
4290 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
4291 break;
4292
4293 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4294 continue;
4295
4296 if (physcpu >= cg_cpu_usage_size)
4297 continue;
4298
4299 curcpu ++;
4300 cpu_cnt ++;
4301
4302 if (!cpu_in_cpuset(physcpu, cpuset)) {
4303 for (i = curcpu; i <= physcpu; i++)
4304 cg_cpu_usage[i].online = false;
4305 continue;
4306 }
4307
4308 if (curcpu < physcpu) {
4309 /* Some CPUs may be disabled */
4310 for (i = curcpu; i < physcpu; i++)
4311 cg_cpu_usage[i].online = false;
4312
4313 curcpu = physcpu;
4314 }
4315
4316 cg_cpu_usage[curcpu].online = true;
4317
4318 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4319 &user,
4320 &nice,
4321 &system,
4322 &idle,
4323 &iowait,
4324 &irq,
4325 &softirq,
4326 &steal,
4327 &guest,
4328 &guest_nice);
4329
4330 if (ret != 10)
4331 continue;
4332
4333 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4334 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4335
4336 if (all_used >= cg_used) {
4337 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4338
4339 } else {
4340 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4341 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4342 curcpu, cg, all_used, cg_used);
4343 cg_cpu_usage[curcpu].idle = idle;
4344 }
4345 }
4346
4347 /* Cannot use more CPUs than is available due to cpuset */
4348 if (max_cpus > cpu_cnt)
4349 max_cpus = cpu_cnt;
4350
4351 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4352
4353 if (!stat_node) {
4354 lxcfs_error("unable to find/create stat node for %s\n", cg);
4355 return 0;
4356 }
4357
4358 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4359 if (!diff) {
4360 return 0;
4361 }
4362
4363 /*
4364 * If the new values are LOWER than values stored in memory, it means
4365 * the cgroup has been reset/recreated and we should reset too.
4366 */
4367 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4368 if (!cg_cpu_usage[curcpu].online)
4369 continue;
4370
4371 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4372 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4373
4374 break;
4375 }
4376
4377 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4378
4379 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4380 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4381
4382 if (!stat_node->usage[curcpu].online)
4383 continue;
4384
4385 i++;
4386
4387 stat_node->usage[curcpu].user += diff[curcpu].user;
4388 stat_node->usage[curcpu].system += diff[curcpu].system;
4389 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4390
4391 if (max_cpus > 0 && i >= max_cpus) {
4392 user_surplus += diff[curcpu].user;
4393 system_surplus += diff[curcpu].system;
4394 }
4395 }
4396
4397 /* Calculate usage counters of visible CPUs */
4398 if (max_cpus > 0) {
4399 unsigned long diff_user = 0;
4400 unsigned long diff_system = 0;
4401 unsigned long diff_idle = 0;
4402 unsigned long max_diff_idle = 0;
4403 unsigned long max_diff_idle_index = 0;
4404 double exact_cpus;
4405
4406 /* threshold = maximum usage per cpu, including idle */
4407 threshold = total_sum / cpu_cnt * max_cpus;
4408
4409 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4410 if (!stat_node->usage[curcpu].online)
4411 continue;
4412
4413 i++;
4414
4415 if (i == max_cpus)
4416 break;
4417
4418 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4419 continue;
4420
4421 /* Add user */
4422 add_cpu_usage(&user_surplus, &diff[curcpu],
4423 &diff[curcpu].user, threshold);
4424
4425 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4426 continue;
4427
4428 /* If there is still room, add system */
4429 add_cpu_usage(&system_surplus, &diff[curcpu],
4430 &diff[curcpu].system, threshold);
4431 }
4432
4433 if (user_surplus > 0)
4434 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4435 if (system_surplus > 0)
4436 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4437
4438 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4439 if (!stat_node->usage[curcpu].online)
4440 continue;
4441
4442 i++;
4443
4444 if (i == max_cpus)
4445 break;
4446
4447 stat_node->view[curcpu].user += diff[curcpu].user;
4448 stat_node->view[curcpu].system += diff[curcpu].system;
4449 stat_node->view[curcpu].idle += diff[curcpu].idle;
4450
4451 user_sum += stat_node->view[curcpu].user;
4452 system_sum += stat_node->view[curcpu].system;
4453 idle_sum += stat_node->view[curcpu].idle;
4454
4455 diff_user += diff[curcpu].user;
4456 diff_system += diff[curcpu].system;
4457 diff_idle += diff[curcpu].idle;
4458 if (diff[curcpu].idle > max_diff_idle) {
4459 max_diff_idle = diff[curcpu].idle;
4460 max_diff_idle_index = curcpu;
4461 }
4462
4463 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4464 }
4465 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4466
4467 /* revise cpu usage view to support partial cpu case. */
4468 exact_cpus = exact_cpu_count(cg);
4469 if (exact_cpus < (double)max_cpus){
4470 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4471
4472 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4473 lxcfs_v("delta: %lu\n", delta);
4474 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4475 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4476 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4477
4478 curcpu = max_diff_idle_index;
4479 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4480 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4481 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4482 }
4483 } else {
4484 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4485 if (!stat_node->usage[curcpu].online)
4486 continue;
4487
4488 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4489 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4490 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4491
4492 user_sum += stat_node->view[curcpu].user;
4493 system_sum += stat_node->view[curcpu].system;
4494 idle_sum += stat_node->view[curcpu].idle;
4495 }
4496 }
4497
4498 /* Render the file */
4499 /* cpu-all */
4500 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4501 user_sum,
4502 system_sum,
4503 idle_sum);
4504 lxcfs_v("cpu-all: %s\n", buf);
4505
4506 if (l < 0) {
4507 perror("Error writing to cache");
4508 return 0;
4509 }
4510 if (l >= buf_size) {
4511 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4512 return 0;
4513 }
4514
4515 buf += l;
4516 buf_size -= l;
4517 total_len += l;
4518
4519 /* Render visible CPUs */
4520 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4521 if (!stat_node->usage[curcpu].online)
4522 continue;
4523
4524 i++;
4525
4526 if (max_cpus > 0 && i == max_cpus)
4527 break;
4528
4529 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4530 i,
4531 stat_node->view[curcpu].user,
4532 stat_node->view[curcpu].system,
4533 stat_node->view[curcpu].idle);
4534 lxcfs_v("cpu: %s\n", buf);
4535
4536 if (l < 0) {
4537 perror("Error writing to cache");
4538 return 0;
4539
4540 }
4541 if (l >= buf_size) {
4542 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4543 return 0;
4544 }
4545
4546 buf += l;
4547 buf_size -= l;
4548 total_len += l;
4549 }
4550
4551 /* Pass the rest of /proc/stat, start with the last line read */
4552 l = snprintf(buf, buf_size, "%s", line);
4553
4554 if (l < 0) {
4555 perror("Error writing to cache");
4556 return 0;
4557
4558 }
4559 if (l >= buf_size) {
4560 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4561 return 0;
4562 }
4563
4564 buf += l;
4565 buf_size -= l;
4566 total_len += l;
4567
4568 /* Pass the rest of the host's /proc/stat */
4569 while (getline(&line, &linelen, f) != -1) {
4570 l = snprintf(buf, buf_size, "%s", line);
4571 if (l < 0) {
4572 perror("Error writing to cache");
4573 return 0;
4574 }
4575 if (l >= buf_size) {
4576 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4577 return 0;
4578 }
4579 buf += l;
4580 buf_size -= l;
4581 total_len += l;
4582 }
4583
4584 if (stat_node)
4585 pthread_mutex_unlock(&stat_node->lock);
4586 return total_len;
4587 }
4588
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/* FUSE read handler for the container view of /proc/stat.
 *
 * Per-cpu lines are restricted to the caller's cpuset and renumbered from
 * 0; when cpuacct data is readable, user/system come from the cgroup and
 * idle is derived from the host line. Falls through to the raw host file
 * when the caller is in the host pid namespace (initpid == 1) or has no
 * cpuset cgroup. The rendered file is cached in d->buf so that non-zero
 * offsets can be served from the cache.
 *
 * Buffer layout trick: the first CPUALL_MAX_SIZE bytes of d->buf are
 * reserved for the aggregate "cpu" line (whose sums are only known after
 * all per-cpu lines were written); per-cpu lines are written after the
 * reserve and memmove'd down at the end.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free struct cpuacct_usage *cg_cpu_usage = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
		irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
		iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
		steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	int cg_cpu_usage_size = 0;

	/* Follow-up reads are served from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;

	/*
	 * when container run with host pid namespace initpid == 1, cgroup will "/"
	 * we should return host os's /proc contents.
	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
	 */
	if (initpid == 1) {
		return read_file_fuse("/proc/stat", buf, size, d);
	}

	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file_fuse("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		return 0;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		return 0;
	}

	/* Prefer the cpu-limit-aware cpuview rendering when available. */
	if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide cpus outside the container's cpuset. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10 || !cg_cpu_usage) {
			/* Parse failure or no cpuacct data: copy the host
			 * line through, renumbered to the container index. */
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				/* Host time not consumed by the cgroup shows
				 * up as idle time in the container. */
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Build the aggregate "cpu" line now that the sums are known. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-cpu lines down so they directly follow the
	 * aggregate line (regions may overlap, hence memmove). */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
4827
4828 /* This function retrieves the busy time of a group of tasks by looking at
4829 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
4832 * account as well. If someone has a clever solution for this please send a
4833 * patch!
4834 */
4835 static double get_reaper_busy(pid_t task)
4836 {
4837 __do_free char *cgroup = NULL, *usage_str = NULL;
4838 unsigned long usage = 0;
4839 pid_t initpid;
4840
4841 initpid = lookup_initpid_in_store(task);
4842 if (initpid <= 0)
4843 return 0;
4844
4845 cgroup = get_pid_cgroup(initpid, "cpuacct");
4846 if (!cgroup)
4847 return 0;
4848 prune_init_slice(cgroup);
4849 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4850 &usage_str))
4851 return 0;
4852
4853 usage = strtoul(usage_str, NULL, 10);
4854 return ((double)usage / 1000000000);
4855 }
4856
4857 #if RELOADTEST
/* Reload-test hook: drop a marker file so the test suite can detect that
 * the reloaded binary actually ran. */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd >= 0)
		close(fd);
}
4866 #endif
4867
4868 /*
4869 * We read /proc/uptime and reuse its second field.
4870 * For the first field, we use the mtime for the reaper for
4871 * the calling pid as returned by getreaperage
4872 */
4873 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4874 struct fuse_file_info *fi)
4875 {
4876 struct fuse_context *fc = fuse_get_context();
4877 struct file_info *d = (struct file_info *)fi->fh;
4878 double busytime = get_reaper_busy(fc->pid);
4879 char *cache = d->buf;
4880 ssize_t total_len = 0;
4881 double idletime, reaperage;
4882
4883 #if RELOADTEST
4884 iwashere();
4885 #endif
4886
4887 if (offset){
4888 if (!d->cached)
4889 return 0;
4890 if (offset > d->size)
4891 return -EINVAL;
4892 int left = d->size - offset;
4893 total_len = left > size ? size: left;
4894 memcpy(buf, cache + offset, total_len);
4895 return total_len;
4896 }
4897
4898 reaperage = get_reaper_age(fc->pid);
4899 /* To understand why this is done, please read the comment to the
4900 * get_reaper_busy() function.
4901 */
4902 idletime = reaperage;
4903 if (reaperage >= busytime)
4904 idletime = reaperage - busytime;
4905
4906 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
4907 if (total_len < 0 || total_len >= d->buflen){
4908 lxcfs_error("%s\n", "failed to write to cache");
4909 return 0;
4910 }
4911
4912 d->size = (int)total_len;
4913 d->cached = 1;
4914
4915 if (total_len > size) total_len = size;
4916
4917 memcpy(buf, d->buf, total_len);
4918 return total_len;
4919 }
4920
/*
 * Read handler for the virtualized /proc/diskstats.
 *
 * Per-device I/O counters are taken from the caller's blkio cgroup
 * (io_serviced, io_merged, io_service_bytes, io_wait_time, io_service_time)
 * and printed in /proc/diskstats format for every device that shows any
 * activity. Falls back to the host's /proc/diskstats when the blkio cgroup
 * cannot be determined or a counter is reported as unsupported.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *io_serviced_str = NULL,
		       *io_merged_str = NULL, *io_service_bytes_str = NULL,
		       *io_wait_time_str = NULL, *io_service_time_str = NULL,
		       *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	size_t linelen = 0, total_len = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	int ret;
	char dev_name[72];

	/* Continued read: serve the remainder from the cached buffer. */
	if (offset){
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file_fuse("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Fetch the raw cgroup counter blobs. Only -EOPNOTSUPP falls back to
	 * the host file; other errors leave the corresponding string NULL. */
	ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	f = fopen("/proc/diskstats", "r");
	if (!f)
		return 0;

	/* Walk the host's device list but substitute per-cgroup counters. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* io_service_bytes is divided by 512 to yield sector counts. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Times are divided by 1000000 — presumably ns -> ms as
		 * /proc/diskstats expects; confirm against blkio docs. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* Only emit devices that saw any I/O in this cgroup;
		 * ios_pgr and rq_ticks are always reported as 0. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			return 0;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
5064
5065 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5066 struct fuse_file_info *fi)
5067 {
5068 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5069 *memswusage_str = NULL;
5070 struct fuse_context *fc = fuse_get_context();
5071 struct file_info *d = (struct file_info *)fi->fh;
5072 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5073 memswusage = 0, swap_total = 0, swap_free = 0;
5074 ssize_t total_len = 0;
5075 ssize_t l = 0;
5076 char *cache = d->buf;
5077 int ret;
5078
5079 if (offset) {
5080 int left;
5081
5082 if (offset > d->size)
5083 return -EINVAL;
5084
5085 if (!d->cached)
5086 return 0;
5087
5088 left = d->size - offset;
5089 total_len = left > size ? size: left;
5090 memcpy(buf, cache + offset, total_len);
5091
5092 return total_len;
5093 }
5094
5095 pid_t initpid = lookup_initpid_in_store(fc->pid);
5096 if (initpid <= 1 || is_shared_pidns(initpid))
5097 initpid = fc->pid;
5098 cg = get_pid_cgroup(initpid, "memory");
5099 if (!cg)
5100 return read_file_fuse("/proc/swaps", buf, size, d);
5101 prune_init_slice(cg);
5102
5103 memlimit = get_min_memlimit(cg, false);
5104
5105 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5106 if (ret < 0)
5107 return 0;
5108
5109 memusage = strtoul(memusage_str, NULL, 10);
5110
5111 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5112 if (ret >= 0)
5113 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5114 if (ret >= 0) {
5115 memswlimit = get_min_memlimit(cg, true);
5116 memswusage = strtoul(memswusage_str, NULL, 10);
5117 swap_total = (memswlimit - memlimit) / 1024;
5118 swap_free = (memswusage - memusage) / 1024;
5119 }
5120
5121 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5122
5123 /* When no mem + swap limit is specified or swapaccount=0*/
5124 if (!memswlimit) {
5125 __do_free char *line = NULL;
5126 __do_fclose FILE *f = NULL;
5127 size_t linelen = 0;
5128
5129 f = fopen("/proc/meminfo", "r");
5130 if (!f)
5131 return 0;
5132
5133 while (getline(&line, &linelen, f) != -1) {
5134 if (startswith(line, "SwapTotal:"))
5135 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5136 else if (startswith(line, "SwapFree:"))
5137 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5138 }
5139 }
5140
5141 if (swap_total > 0) {
5142 l = snprintf(d->buf + total_len, d->size - total_len,
5143 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5144 swap_total, swap_free);
5145 total_len += l;
5146 }
5147
5148 if (total_len < 0 || l < 0) {
5149 perror("Error writing to cache");
5150 return 0;
5151 }
5152
5153 d->cached = 1;
5154 d->size = (int)total_len;
5155
5156 if (total_len > size) total_len = size;
5157 memcpy(buf, d->buf, total_len);
5158 return total_len;
5159 }
5160
5161 /*
5162 * Find the process pid from cgroup path.
5163 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5164 * @pid_buf : put pid to pid_buf.
5165 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5166 * @depth : the depth of cgroup in container.
5167 * @sum : return the number of pid.
5168 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5169 */
5170 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5171 {
5172 __do_free char *path = NULL;
5173 __do_close_prot_errno int fd = -EBADF;
5174 __do_fclose FILE *f = NULL;
5175 __do_closedir DIR *dir = NULL;
5176 struct dirent *file;
5177 size_t linelen = 0;
5178 char *line = NULL;
5179 int pd;
5180 char **pid;
5181
5182 /* path = dpath + "/cgroup.procs" + /0 */
5183 path = malloc(strlen(dpath) + 20);
5184 if (!path)
5185 return sum;
5186
5187 strcpy(path, dpath);
5188 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
5189 if (fd < 0)
5190 return sum;
5191
5192 dir = fdopendir(move_fd(fd));
5193 if (!dir)
5194 return sum;
5195
5196 while (((file = readdir(dir)) != NULL) && depth > 0) {
5197 if (strcmp(file->d_name, ".") == 0)
5198 continue;
5199
5200 if (strcmp(file->d_name, "..") == 0)
5201 continue;
5202
5203 if (file->d_type == DT_DIR) {
5204 __do_free char *path_dir = NULL;
5205
5206 /* path + '/' + d_name +/0 */
5207 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5208 if (!path_dir)
5209 return sum;
5210
5211 strcpy(path_dir, path);
5212 strcat(path_dir, "/");
5213 strcat(path_dir, file->d_name);
5214 pd = depth - 1;
5215 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5216 }
5217 }
5218
5219 strcat(path, "/cgroup.procs");
5220 fd = openat(cfd, path, O_RDONLY);
5221 if (fd < 0)
5222 return sum;
5223
5224 f = fdopen(move_fd(fd), "r");
5225 if (!f)
5226 return sum;
5227
5228 while (getline(&line, &linelen, f) != -1) {
5229 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5230 if (!pid)
5231 return sum;
5232 *pid_buf = pid;
5233
5234 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5235 if (!*(*pid_buf + sum))
5236 return sum;
5237
5238 strcpy(*(*pid_buf + sum), line);
5239 sum++;
5240 }
5241
5242 return sum;
5243 }
5244
5245 /*
5246 * calc_load calculates the load according to the following formula:
5247 * load1 = load0 * exp + active * (1 - exp)
5248 *
5249 * @load1: the new loadavg.
5250 * @load0: the former loadavg.
5251 * @active: the total number of running pid at this moment.
5252 * @exp: the fixed-point defined in the beginning.
5253 */
5254 static unsigned long
5255 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5256 {
5257 unsigned long newload;
5258
5259 active = active > 0 ? active * FIXED_1 : 0;
5260 newload = load * exp + active * (FIXED_1 - exp);
5261 if (active >= load)
5262 newload += FIXED_1 - 1;
5263
5264 return newload / FIXED_1;
5265 }
5266
5267 /*
5268 * Return 0 means that container p->cg is closed.
5269 * Return -1 means that error occurred in refresh.
5270 * Positive num equals the total number of pid.
5271 */
/*
 * Refresh one loadavg tracking node: count all tasks of all pids in the
 * cgroup, count those in state 'R' (running) or 'D' (uninterruptible),
 * record the largest pid seen, and fold the running count into the node's
 * exponentially-decayed avenrun[] via calc_load().
 *
 * Returns 0 when the cgroup is empty (caller removes the node), -1 on
 * error, otherwise the number of pids found.
 */
static int refresh_load(struct load_node *p, char *path)
{
	__do_free char *line = NULL;
	char **idbuf;
	char proc_path[256];
	int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
	size_t linelen = 0;
	int sum, length;
	struct dirent *file;

	idbuf = malloc(sizeof(char *));
	if (!idbuf)
		return -1;

	sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
	/* normal exit */
	if (sum == 0)
		goto out;

	for (i = 0; i < sum; i++) {
		__do_closedir DIR *dp = NULL;

		/*clean up '\n' */
		length = strlen(idbuf[i])-1;
		idbuf[i][length] = '\0';
		ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
		if (ret < 0 || ret > 255) {
			lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
			/* Set i = sum so err_out frees every collected pid. */
			i = sum;
			sum = -1;
			goto err_out;
		}

		dp = opendir(proc_path);
		if (!dp) {
			lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
			continue;
		}
		while ((file = readdir(dp)) != NULL) {
			__do_fclose FILE *f = NULL;

			/* NOTE: length-1 strncmp against "." already matches
			 * any dotfile including "..", so the second test below
			 * never fires. */
			if (strncmp(file->d_name, ".", 1) == 0)
				continue;
			if (strncmp(file->d_name, "..", 1) == 0)
				continue;
			total_pid++;
			/* We make the biggest pid become last_pid.*/
			ret = atof(file->d_name);
			last_pid = (ret > last_pid) ? ret : last_pid;

			ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
			if (ret < 0 || ret > 255) {
				lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
				i = sum;
				sum = -1;
				goto err_out;
			}

			f = fopen(proc_path, "r");
			if (f != NULL) {
				while (getline(&line, &linelen, f) != -1) {
					/* Find State */
					if ((line[0] == 'S') && (line[1] == 't'))
						break;
				}

				/* Assumes the line reads "State:\t<letter> ..."
				 * so the state letter sits at index 7 — TODO
				 * confirm; if no State line was found, this
				 * reads the last line read instead. */
				if ((line[7] == 'R') || (line[7] == 'D'))
					run_pid++;
			}
		}
	}
	/*Calculate the loadavg.*/
	p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
	p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
	p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
	p->run_pid = run_pid;
	p->total_pid = total_pid;
	p->last_pid = last_pid;

err_out:
	/* Free idbuf[0..i-1]; on the success path i == sum. */
	for (; i > 0; i--)
		free(idbuf[i-1]);
out:
	free(idbuf);
	return sum;
}
5358
5359 /*
5360 * Traverse the hash table and update it.
5361 */
/*
 * Loadavg refresher thread: every FLUSH_TIME seconds walk all LOAD_SIZE
 * hash buckets, refresh each tracked cgroup's load via refresh_load(), and
 * delete nodes whose cgroup no longer contains any pid. Exits when
 * loadavg_stop is set to 1 (see stop_load_daemon()).
 */
void *load_begin(void *arg)
{

	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				__do_free char *path = NULL;

				length = strlen(f->cg) + 2;
				/* strlen(f->cg) + '.' or '' + \0 */
				path = malloc(length);
				if (!path)
					goto out;

				ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}

				sum = refresh_load(f, path);
				if (sum == 0)
					f = del_node(f, i);
				else
					/* The "goto out" error paths above land on this
					 * label too: skip the failed node and advance. */
					out: f = f->next;
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		time2 = clock();
		/* Sleep out the remainder of the flush interval.
		 * NOTE(review): clock() is CPU time, not wall time, and if the
		 * subtraction goes negative the int converts to a huge
		 * useconds_t for usleep() — confirm this is intended. */
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5419
5420 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5421 struct fuse_file_info *fi)
5422 {
5423 struct fuse_context *fc = fuse_get_context();
5424 struct file_info *d = (struct file_info *)fi->fh;
5425 pid_t initpid;
5426 char *cg;
5427 size_t total_len = 0;
5428 char *cache = d->buf;
5429 struct load_node *n;
5430 int hash;
5431 int cfd, rv = 0;
5432 unsigned long a, b, c;
5433
5434 if (offset) {
5435 if (offset > d->size)
5436 return -EINVAL;
5437 if (!d->cached)
5438 return 0;
5439 int left = d->size - offset;
5440 total_len = left > size ? size : left;
5441 memcpy(buf, cache + offset, total_len);
5442 return total_len;
5443 }
5444 if (!loadavg)
5445 return read_file_fuse("/proc/loadavg", buf, size, d);
5446
5447 initpid = lookup_initpid_in_store(fc->pid);
5448 if (initpid <= 1 || is_shared_pidns(initpid))
5449 initpid = fc->pid;
5450 cg = get_pid_cgroup(initpid, "cpu");
5451 if (!cg)
5452 return read_file_fuse("/proc/loadavg", buf, size, d);
5453
5454 prune_init_slice(cg);
5455 hash = calc_hash(cg) % LOAD_SIZE;
5456 n = locate_node(cg, hash);
5457
5458 /* First time */
5459 if (n == NULL) {
5460 cfd = get_cgroup_fd("cpu");
5461 if (cfd >= 0) {
5462 /*
5463 * In locate_node() above, pthread_rwlock_unlock() isn't used
5464 * because delete is not allowed before read has ended.
5465 */
5466 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5467 rv = 0;
5468 goto err;
5469 }
5470 do {
5471 n = malloc(sizeof(struct load_node));
5472 } while (!n);
5473
5474 do {
5475 n->cg = malloc(strlen(cg)+1);
5476 } while (!n->cg);
5477 strcpy(n->cg, cg);
5478 n->avenrun[0] = 0;
5479 n->avenrun[1] = 0;
5480 n->avenrun[2] = 0;
5481 n->run_pid = 0;
5482 n->total_pid = 1;
5483 n->last_pid = initpid;
5484 n->cfd = cfd;
5485 insert_node(&n, hash);
5486 }
5487 a = n->avenrun[0] + (FIXED_1/200);
5488 b = n->avenrun[1] + (FIXED_1/200);
5489 c = n->avenrun[2] + (FIXED_1/200);
5490 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5491 LOAD_INT(a), LOAD_FRAC(a),
5492 LOAD_INT(b), LOAD_FRAC(b),
5493 LOAD_INT(c), LOAD_FRAC(c),
5494 n->run_pid, n->total_pid, n->last_pid);
5495 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5496 if (total_len < 0 || total_len >= d->buflen) {
5497 lxcfs_error("%s\n", "Failed to write to cache");
5498 rv = 0;
5499 goto err;
5500 }
5501 d->size = (int)total_len;
5502 d->cached = 1;
5503
5504 if (total_len > size)
5505 total_len = size;
5506 memcpy(buf, d->buf, total_len);
5507 rv = total_len;
5508
5509 err:
5510 free(cg);
5511 return rv;
5512 }
5513 /* Return a positive number on success, return 0 on failure.*/
5514 pthread_t load_daemon(int load_use)
5515 {
5516 int ret;
5517 pthread_t pid;
5518
5519 ret = init_load();
5520 if (ret == -1) {
5521 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5522 return 0;
5523 }
5524 ret = pthread_create(&pid, NULL, load_begin, NULL);
5525 if (ret != 0) {
5526 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5527 load_free();
5528 return 0;
5529 }
5530 /* use loadavg, here loadavg = 1*/
5531 loadavg = load_use;
5532 return pid;
5533 }
5534
5535 /* Returns 0 on success. */
5536 int stop_load_daemon(pthread_t pid)
5537 {
5538 int s;
5539
5540 /* Signal the thread to gracefully stop */
5541 loadavg_stop = 1;
5542
5543 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5544 if (s != 0) {
5545 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5546 return -1;
5547 }
5548
5549 load_free();
5550 loadavg_stop = 0;
5551
5552 return 0;
5553 }
5554
/* Total number of bytes readable from @which, found by reading it through
 * (procfs files report st_size 0, so stat() is useless here).
 * Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	off_t total = 0;
	size_t n;
	FILE *f;

	f = fopen(which, "r");
	if (!f)
		return 0;

	while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0)
		total += n;

	fclose(f);

	return total;
}
5571
5572 int proc_getattr(const char *path, struct stat *sb)
5573 {
5574 struct timespec now;
5575
5576 memset(sb, 0, sizeof(struct stat));
5577 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5578 return -EINVAL;
5579 sb->st_uid = sb->st_gid = 0;
5580 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5581 if (strcmp(path, "/proc") == 0) {
5582 sb->st_mode = S_IFDIR | 00555;
5583 sb->st_nlink = 2;
5584 return 0;
5585 }
5586 if (strcmp(path, "/proc/meminfo") == 0 ||
5587 strcmp(path, "/proc/cpuinfo") == 0 ||
5588 strcmp(path, "/proc/uptime") == 0 ||
5589 strcmp(path, "/proc/stat") == 0 ||
5590 strcmp(path, "/proc/diskstats") == 0 ||
5591 strcmp(path, "/proc/swaps") == 0 ||
5592 strcmp(path, "/proc/loadavg") == 0) {
5593 sb->st_size = 0;
5594 sb->st_mode = S_IFREG | 00444;
5595 sb->st_nlink = 1;
5596 return 0;
5597 }
5598
5599 return -ENOENT;
5600 }
5601
5602 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5603 struct fuse_file_info *fi)
5604 {
5605 if (filler(buf, ".", NULL, 0) != 0 ||
5606 filler(buf, "..", NULL, 0) != 0 ||
5607 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5608 filler(buf, "meminfo", NULL, 0) != 0 ||
5609 filler(buf, "stat", NULL, 0) != 0 ||
5610 filler(buf, "uptime", NULL, 0) != 0 ||
5611 filler(buf, "diskstats", NULL, 0) != 0 ||
5612 filler(buf, "swaps", NULL, 0) != 0 ||
5613 filler(buf, "loadavg", NULL, 0) != 0)
5614 return -EINVAL;
5615 return 0;
5616 }
5617
5618 int proc_open(const char *path, struct fuse_file_info *fi)
5619 {
5620 int type = -1;
5621 struct file_info *info;
5622
5623 if (strcmp(path, "/proc/meminfo") == 0)
5624 type = LXC_TYPE_PROC_MEMINFO;
5625 else if (strcmp(path, "/proc/cpuinfo") == 0)
5626 type = LXC_TYPE_PROC_CPUINFO;
5627 else if (strcmp(path, "/proc/uptime") == 0)
5628 type = LXC_TYPE_PROC_UPTIME;
5629 else if (strcmp(path, "/proc/stat") == 0)
5630 type = LXC_TYPE_PROC_STAT;
5631 else if (strcmp(path, "/proc/diskstats") == 0)
5632 type = LXC_TYPE_PROC_DISKSTATS;
5633 else if (strcmp(path, "/proc/swaps") == 0)
5634 type = LXC_TYPE_PROC_SWAPS;
5635 else if (strcmp(path, "/proc/loadavg") == 0)
5636 type = LXC_TYPE_PROC_LOADAVG;
5637 if (type == -1)
5638 return -ENOENT;
5639
5640 info = malloc(sizeof(*info));
5641 if (!info)
5642 return -ENOMEM;
5643
5644 memset(info, 0, sizeof(*info));
5645 info->type = type;
5646
5647 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5648 do {
5649 info->buf = malloc(info->buflen);
5650 } while (!info->buf);
5651 memset(info->buf, 0, info->buflen);
5652 /* set actual size to buffer size */
5653 info->size = info->buflen;
5654
5655 fi->fh = (unsigned long)info;
5656 return 0;
5657 }
5658
/* FUSE access for /proc: the directory defers to the host's permission
 * check; every file below it is read-only.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
5669
/* FUSE release for /proc files: free the per-open file_info. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
5675
5676 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5677 struct fuse_file_info *fi)
5678 {
5679 struct file_info *f = (struct file_info *) fi->fh;
5680
5681 switch (f->type) {
5682 case LXC_TYPE_PROC_MEMINFO:
5683 return proc_meminfo_read(buf, size, offset, fi);
5684 case LXC_TYPE_PROC_CPUINFO:
5685 return proc_cpuinfo_read(buf, size, offset, fi);
5686 case LXC_TYPE_PROC_UPTIME:
5687 return proc_uptime_read(buf, size, offset, fi);
5688 case LXC_TYPE_PROC_STAT:
5689 return proc_stat_read(buf, size, offset, fi);
5690 case LXC_TYPE_PROC_DISKSTATS:
5691 return proc_diskstats_read(buf, size, offset, fi);
5692 case LXC_TYPE_PROC_SWAPS:
5693 return proc_swaps_read(buf, size, offset, fi);
5694 case LXC_TYPE_PROC_LOADAVG:
5695 return proc_loadavg_read(buf, size, offset, fi);
5696 default:
5697 return -EINVAL;
5698 }
5699 }
5700
5701 /*
5702 * Functions needed to setup cgroups in the __constructor__.
5703 */
5704
5705 static bool umount_if_mounted(void)
5706 {
5707 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5708 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5709 return false;
5710 }
5711 return true;
5712 }
5713
5714 /* __typeof__ should be safe to use with all compilers. */
5715 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5716 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5717 {
5718 return (fs->f_type == (fs_type_magic)magic_val);
5719 }
5720
5721 /*
5722 * looking at fs/proc_namespace.c, it appears we can
5723 * actually expect the rootfs entry to very specifically contain
5724 * " - rootfs rootfs "
5725 * IIUC, so long as we've chrooted so that rootfs is not our root,
5726 * the rootfs entry should always be skipped in mountinfo contents.
5727 */
static bool is_on_ramfs(void)
{
	bool ret = false;
	char *field, *end;
	char *buf = NULL;
	size_t buflen = 0;
	int skip;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&buf, &buflen, f) != -1) {
		/* Skip to the 5th space-separated field: the mount point. */
		field = buf;
		for (skip = 0; field && skip < 4; skip++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		end = strchr(field + 1, ' ');
		if (!end)
			continue;
		*end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is its filesystem the rootfs ramfs? */
		field = strchr(end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0) {
			ret = true;
			break;
		}
	}

	free(buf);
	fclose(f);
	return ret;
}
5763
5764 static int pivot_enter()
5765 {
5766 int ret = -1, oldroot = -1, newroot = -1;
5767
5768 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5769 if (oldroot < 0) {
5770 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5771 return ret;
5772 }
5773
5774 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5775 if (newroot < 0) {
5776 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5777 goto err;
5778 }
5779
5780 /* change into new root fs */
5781 if (fchdir(newroot) < 0) {
5782 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5783 goto err;
5784 }
5785
5786 /* pivot_root into our new root fs */
5787 if (pivot_root(".", ".") < 0) {
5788 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5789 goto err;
5790 }
5791
5792 /*
5793 * At this point the old-root is mounted on top of our new-root.
5794 * To unmounted it we must not be chdir'd into it, so escape back
5795 * to the old-root.
5796 */
5797 if (fchdir(oldroot) < 0) {
5798 lxcfs_error("%s\n", "Failed to enter old root.");
5799 goto err;
5800 }
5801
5802 if (umount2(".", MNT_DETACH) < 0) {
5803 lxcfs_error("%s\n", "Failed to detach old root.");
5804 goto err;
5805 }
5806
5807 if (fchdir(newroot) < 0) {
5808 lxcfs_error("%s\n", "Failed to re-enter new root.");
5809 goto err;
5810 }
5811
5812 ret = 0;
5813
5814 err:
5815 if (oldroot > 0)
5816 close(oldroot);
5817 if (newroot > 0)
5818 close(newroot);
5819
5820 return ret;
5821 }
5822
5823 static int chroot_enter()
5824 {
5825 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5826 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5827 return -1;
5828 }
5829
5830 if (chroot(".") < 0) {
5831 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5832 return -1;
5833 }
5834
5835 if (chdir("/") < 0) {
5836 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5837 return -1;
5838 }
5839
5840 return 0;
5841 }
5842
/* Enter the prepared new root: chroot() when / is a ramfs, pivot_root()
 * otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable: when the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC, so when the magic says no we still
	 * consult /proc/1/mountinfo via is_on_ramfs(). */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() < 0) {
		lxcfs_error("%s\n", "Could not perform pivot root.");
		return -1;
	}

	return 0;
}
5865
5866 /* Prepare our new clean root. */
5867 static int permute_prepare(void)
5868 {
5869 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5870 lxcfs_error("%s\n", "Failed to create directory for new root.");
5871 return -1;
5872 }
5873
5874 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5875 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5876 return -1;
5877 }
5878
5879 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5880 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5881 return -1;
5882 }
5883
5884 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5885 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5886 return -1;
5887 }
5888
5889 return 0;
5890 }
5891
5892 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare new root. */
	if (permute_prepare() < 0)
		return false;

	/* Pivot into new root. */
	return permute_and_enter() == 0;
}
5905
/*
 * Prepare the private cgroup mount area: create BASEDIR, drop any stale
 * mount on it, unshare a new mount namespace (preserving an fd to it in
 * cgroup_ops->mntns_fd), make / private, and mount a tmpfs at BASEDIR.
 * NOTE: the step order matters — do not reorder these calls.
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Everything below happens in a fresh mount namespace so the host's
	 * mount table stays untouched. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
	if (cgroup_ops->mntns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep our mounts from propagating back to the host. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
5941
5942 static bool cgfs_mount_hierarchies(void)
5943 {
5944 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
5945 return false;
5946
5947 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
5948 return false;
5949
5950 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
5951 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
5952 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
5953 if ((*h)->fd < 0)
5954 return false;
5955 }
5956
5957 return true;
5958 }
5959
/* Orchestrate the private cgroup setup: prepare the mount namespace,
 * mount the hierarchies, then pivot into the new root.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5975
5976 static void __attribute__((constructor)) lxcfs_init(void)
5977 {
5978 __do_close_prot_errno int init_ns = -EBADF;
5979 char *cret;
5980 char cwd[MAXPATHLEN];
5981
5982 cgroup_ops = cgroup_init();
5983 if (!cgroup_ops)
5984 log_exit("Failed to initialize cgroup support");
5985
5986 /* Preserve initial namespace. */
5987 init_ns = preserve_ns(getpid(), "mnt");
5988 if (init_ns < 0)
5989 log_exit("Failed to preserve initial mount namespace");
5990
5991 cret = getcwd(cwd, MAXPATHLEN);
5992 log_exit("%s - Could not retrieve current working directory", strerror(errno));
5993
5994 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5995 * to privately mount lxcfs cgroups. */
5996 if (!cgfs_setup_controllers())
5997 log_exit("Failed to setup private cgroup mounts for lxcfs");
5998
5999 if (setns(init_ns, 0) < 0)
6000 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
6001
6002 if (!cret || chdir(cwd) < 0)
6003 log_exit("%s - Could not change back to original working directory", strerror(errno));
6004
6005 if (!init_cpuview())
6006 log_exit("Failed to init CPU view");
6007
6008 print_subsystems();
6009 }
6010
/* Library destructor: tear down the CPU view bookkeeping and release the
 * cgroup layer state acquired in lxcfs_init().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_debug("%s\n", "Running destructor for liblxcfs");
	free_cpuview();
	cgroup_exit(cgroup_ops);
}