1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "cgroups/cgroup.h"
42 #include "cgroups/cgroup_utils.h"
43 #include "memory_utils.h"
44 #include "config.h"
45
46 /* Define pivot_root() if missing from the C library */
47 #ifndef HAVE_PIVOT_ROOT
48 static int pivot_root(const char * new_root, const char * put_old)
49 {
50 #ifdef __NR_pivot_root
51 return syscall(__NR_pivot_root, new_root, put_old);
52 #else
53 errno = ENOSYS;
54 return -1;
55 #endif
56 }
57 #else
58 extern int pivot_root(const char * new_root, const char * put_old);
59 #endif
60
61 struct cpuacct_usage {
62 uint64_t user;
63 uint64_t system;
64 uint64_t idle;
65 bool online;
66 };
67
68 /* Parameters for the load average hash table. */
69 #define LOAD_SIZE 100 /* the number of hash buckets */
70 #define FLUSH_TIME 5 /* seconds between refreshes of the table */
71 #define DEPTH_DIR 3 /* max directory depth scanned per cgroup */
72 /* Constants for calculating loadavg. */
73 #define FSHIFT 11 /* nr of bits of precision */
74 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
75 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
76 #define EXP_5 2014 /* 1/exp(5sec/5min) */
77 #define EXP_15 2037 /* 1/exp(5sec/15min) */
78 #define LOAD_INT(x) ((x) >> FSHIFT)
79 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
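/*
 * Illustrative sketch (added for clarity, not part of the original source):
 * the fixed-point constants above mirror the kernel's loadavg arithmetic.
 * A decayed average is updated as
 *	load = (load * EXP_1 + active * (FIXED_1 - EXP_1)) >> FSHIFT;
 * and rendered for display the way /proc/loadavg does:
 *	unsigned long avg = 2 * FIXED_1 + FIXED_1 / 2;	// 2.50 in fixed-point
 *	printf("%lu.%02lu\n", LOAD_INT(avg), LOAD_FRAC(avg));	// prints "2.50"
 */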
80 /*
81 * This flag is used by proc_loadavg_read():
82 * 1 means loadavg virtualization is enabled, 0 means it is disabled.
83 */
84 static int loadavg = 0;
85 static volatile sig_atomic_t loadavg_stop = 0;
86 static int calc_hash(const char *name)
87 {
88 unsigned int hash = 0;
89 unsigned int x = 0;
90 /* ELFHash algorithm. */
91 while (*name) {
92 hash = (hash << 4) + *name++;
93 x = hash & 0xf0000000;
94 if (x != 0)
95 hash ^= (x >> 24);
96 hash &= ~x;
97 }
98 return (hash & 0x7fffffff);
99 }
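/*
 * Illustrative usage sketch (an assumption, not original code): the hash
 * is folded into a bucket index using the table size defined above:
 *	char path[] = "/lxc/c1";	// hypothetical cgroup path
 *	int locate = calc_hash(path) % LOAD_SIZE;
 *	struct load_node *n = locate_node(path, locate);
 * Note that locate_node() (below) returns with rdlock still held; see its
 * comment.
 */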
100
101 struct load_node {
102 char *cg; /* cgroup path */
103 unsigned long avenrun[3]; /* Load averages */
104 unsigned int run_pid;
105 unsigned int total_pid;
106 unsigned int last_pid;
107 int cfd; /* The file descriptor of the mounted cgroup */
108 struct load_node *next;
109 struct load_node **pre;
110 };
111
112 struct load_head {
113 /*
114 * The lock serializes inserting and refreshing load_nodes. For the first
115 * load_node of each hash bucket, insert and refresh in this hash bucket
116 * are mutually exclusive.
117 */
118 pthread_mutex_t lock;
119 /*
120 * The rdlock serializes reading loadavg against deleting load_nodes. For
121 * each hash bucket, read and delete are mutually exclusive, but
122 * concurrent reads are allowed. This rdlock is at list level.
123 */
124 pthread_rwlock_t rdlock;
125 /*
126 * The rilock serializes reading loadavg against inserting load_nodes. For
127 * the first load_node of each hash bucket, read and insert are mutually
128 * exclusive, but concurrent reads are allowed.
129 */
130 pthread_rwlock_t rilock;
131 struct load_node *next;
132 };
133
134 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
135 /*
136 * init_load initializes the hash table.
137 * Returns 0 on success, -1 on failure.
138 */
139 static int init_load(void)
140 {
141 int i;
142 int ret;
143
144 for (i = 0; i < LOAD_SIZE; i++) {
145 load_hash[i].next = NULL;
146 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
147 if (ret != 0) {
148 lxcfs_error("%s\n", "Failed to initialize lock");
149 goto out3;
150 }
151 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
152 if (ret != 0) {
153 lxcfs_error("%s\n", "Failed to initialize rdlock");
154 goto out2;
155 }
156 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
157 if (ret != 0) {
158 lxcfs_error("%s\n", "Failed to initialize rilock");
159 goto out1;
160 }
161 }
162 return 0;
163 out1:
164 pthread_rwlock_destroy(&load_hash[i].rdlock);
165 out2:
166 pthread_mutex_destroy(&load_hash[i].lock);
167 out3:
168 while (i > 0) {
169 i--;
170 pthread_mutex_destroy(&load_hash[i].lock);
171 pthread_rwlock_destroy(&load_hash[i].rdlock);
172 pthread_rwlock_destroy(&load_hash[i].rilock);
173 }
174 return -1;
175 }
176
177 static void insert_node(struct load_node **n, int locate)
178 {
179 struct load_node *f;
180
181 pthread_mutex_lock(&load_hash[locate].lock);
182 pthread_rwlock_wrlock(&load_hash[locate].rilock);
183 f = load_hash[locate].next;
184 load_hash[locate].next = *n;
185
186 (*n)->pre = &(load_hash[locate].next);
187 if (f)
188 f->pre = &((*n)->next);
189 (*n)->next = f;
190 pthread_mutex_unlock(&load_hash[locate].lock);
191 pthread_rwlock_unlock(&load_hash[locate].rilock);
192 }
193 /*
194 * locate_node() finds a specific node; a non-NULL return means success.
195 * Note that rdlock is deliberately left held on return: this function is
196 * used to read a specific node, and deletion must not happen before the
197 * read has ended. rdlock is unlocked only in
198 * proc_loadavg_read().
199 */
200 static struct load_node *locate_node(char *cg, int locate)
201 {
202 struct load_node *f = NULL;
203 int i = 0;
204
205 pthread_rwlock_rdlock(&load_hash[locate].rilock);
206 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
207 if (load_hash[locate].next == NULL) {
208 pthread_rwlock_unlock(&load_hash[locate].rilock);
209 return f;
210 }
211 f = load_hash[locate].next;
212 pthread_rwlock_unlock(&load_hash[locate].rilock);
213 while (f && ((i = strcmp(f->cg, cg)) != 0))
214 f = f->next;
215 return f;
216 }
217 /* Delete the load_node n and return the node that followed it. */
218 static struct load_node *del_node(struct load_node *n, int locate)
219 {
220 struct load_node *g;
221
222 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
223 if (n->next == NULL) {
224 *(n->pre) = NULL;
225 } else {
226 *(n->pre) = n->next;
227 n->next->pre = n->pre;
228 }
229 g = n->next;
230 free(n->cg);
231 free(n);
232 pthread_rwlock_unlock(&load_hash[locate].rdlock);
233 return g;
234 }
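/*
 * Illustrative sketch (an assumption, not original code): a refresh pass
 * over bucket i can use del_node()'s return value to keep walking while
 * dropping stale entries:
 *	struct load_node *f = load_hash[i].next;
 *	while (f) {
 *		if (cgroup_is_gone(f->cg))	// hypothetical staleness check
 *			f = del_node(f, i);	// unlinks, frees, returns next
 *		else
 *			f = f->next;
 *	}
 */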
235
236 static void load_free(void)
237 {
238 int i;
239 struct load_node *f, *p;
240
241 for (i = 0; i < LOAD_SIZE; i++) {
242 pthread_mutex_lock(&load_hash[i].lock);
243 pthread_rwlock_wrlock(&load_hash[i].rilock);
244 pthread_rwlock_wrlock(&load_hash[i].rdlock);
245 if (load_hash[i].next == NULL) {
246 pthread_mutex_unlock(&load_hash[i].lock);
247 pthread_mutex_destroy(&load_hash[i].lock);
248 pthread_rwlock_unlock(&load_hash[i].rilock);
249 pthread_rwlock_destroy(&load_hash[i].rilock);
250 pthread_rwlock_unlock(&load_hash[i].rdlock);
251 pthread_rwlock_destroy(&load_hash[i].rdlock);
252 continue;
253 }
254 for (f = load_hash[i].next; f; ) {
255 free(f->cg);
256 p = f->next;
257 free(f);
258 f = p;
259 }
260 pthread_mutex_unlock(&load_hash[i].lock);
261 pthread_mutex_destroy(&load_hash[i].lock);
262 pthread_rwlock_unlock(&load_hash[i].rilock);
263 pthread_rwlock_destroy(&load_hash[i].rilock);
264 pthread_rwlock_unlock(&load_hash[i].rdlock);
265 pthread_rwlock_destroy(&load_hash[i].rdlock);
266 }
267 }
268
269 /* Data for CPU view */
270 struct cg_proc_stat {
271 char *cg;
272 struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
273 struct cpuacct_usage *view; // Usage stats reported to the container
274 int cpu_count;
275 pthread_mutex_t lock; // For node manipulation
276 struct cg_proc_stat *next;
277 };
278
279 struct cg_proc_stat_head {
280 struct cg_proc_stat *next;
281 time_t lastcheck;
282
283 /*
284 * For access to the list. Reading can be parallel, pruning is exclusive.
285 */
286 pthread_rwlock_t lock;
287 };
288
289 #define CPUVIEW_HASH_SIZE 100
290 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
291
292 static bool cpuview_init_head(struct cg_proc_stat_head **head)
293 {
294 *head = malloc(sizeof(struct cg_proc_stat_head));
295 if (!(*head)) {
296 lxcfs_error("%s\n", strerror(errno));
297 return false;
298 }
299
300 (*head)->lastcheck = time(NULL);
301 (*head)->next = NULL;
302
303 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
304 lxcfs_error("%s\n", "Failed to initialize list lock");
305 free(*head);
306 return false;
307 }
308
309 return true;
310 }
311
312 static bool init_cpuview()
313 {
314 int i;
315
316 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
317 proc_stat_history[i] = NULL;
318
319 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
320 if (!cpuview_init_head(&proc_stat_history[i]))
321 goto err;
322 }
323
324 return true;
325
326 err:
327 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
328 if (proc_stat_history[i]) {
329 free(proc_stat_history[i]);
330 proc_stat_history[i] = NULL;
331 }
332 }
333
334 return false;
335 }
336
337 static void free_proc_stat_node(struct cg_proc_stat *node)
338 {
339 pthread_mutex_destroy(&node->lock);
340 free(node->cg);
341 free(node->usage);
342 free(node->view);
343 free(node);
344 }
345
346 static void cpuview_free_head(struct cg_proc_stat_head *head)
347 {
348 struct cg_proc_stat *node, *tmp;
349
350 if (head->next) {
351 node = head->next;
352
353 for (;;) {
354 tmp = node;
355 node = node->next;
356 free_proc_stat_node(tmp);
357
358 if (!node)
359 break;
360 }
361 }
362
363 pthread_rwlock_destroy(&head->lock);
364 free(head);
365 }
366
367 static void free_cpuview()
368 {
369 int i;
370
371 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
372 if (proc_stat_history[i])
373 cpuview_free_head(proc_stat_history[i]);
374 }
375 }
376
377 /*
378 * A table caching which pid is init for a pid namespace.
379 * When looking up which pid is init for $qpid, we first
380 * 1. Stat /proc/$qpid/ns/pid.
381 * 2. Check whether the ino_t is in our store.
382 * a. if not, fork a child in qpid's ns to send us
383 * ucred.pid = 1, and read the initpid. Cache
384 * initpid and creation time for /proc/initpid
385 * in a new store entry.
386 * b. if so, verify that /proc/initpid still matches
387 * what we have saved. If not, clear the store
388 * entry and go back to a. If so, return the
389 * cached initpid.
390 */
391 struct pidns_init_store {
392 ino_t ino; // inode number for /proc/$pid/ns/pid
393 pid_t initpid; // the pid of init in that ns
394 long int ctime; // the time at which /proc/$initpid was created
395 struct pidns_init_store *next;
396 long int lastcheck;
397 };
398
399 /* lol - look at how they are allocated in the kernel */
400 #define PIDNS_HASH_SIZE 4096
401 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
402
403 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
404 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
405 static void lock_mutex(pthread_mutex_t *l)
406 {
407 int ret;
408
409 if ((ret = pthread_mutex_lock(l)) != 0) {
410 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
411 exit(1);
412 }
413 }
414
415 static struct cgroup_ops *cgroup_ops;
416
417 static int cgroup_mount_ns_fd = -1;
418
419 static void unlock_mutex(pthread_mutex_t *l)
420 {
421 int ret;
422
423 if ((ret = pthread_mutex_unlock(l)) != 0) {
424 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
425 exit(1);
426 }
427 }
428
429 static void store_lock(void)
430 {
431 lock_mutex(&pidns_store_mutex);
432 }
433
434 static void store_unlock(void)
435 {
436 unlock_mutex(&pidns_store_mutex);
437 }
438
439 /* Must be called under store_lock */
440 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
441 {
442 struct stat initsb;
443 char fnam[100];
444
445 snprintf(fnam, 100, "/proc/%d", e->initpid);
446 if (stat(fnam, &initsb) < 0)
447 return false;
448
449 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
450 initsb.st_ctime, e->initpid);
451
452 if (e->ctime != initsb.st_ctime)
453 return false;
454 return true;
455 }
456
457 /* Must be called under store_lock */
458 static void remove_initpid(struct pidns_init_store *e)
459 {
460 struct pidns_init_store *tmp;
461 int h;
462
463 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
464
465 h = HASH(e->ino);
466 if (pidns_hash_table[h] == e) {
467 pidns_hash_table[h] = e->next;
468 free(e);
469 return;
470 }
471
472 tmp = pidns_hash_table[h];
473 while (tmp) {
474 if (tmp->next == e) {
475 tmp->next = e->next;
476 free(e);
477 return;
478 }
479 tmp = tmp->next;
480 }
481 }
482
483 #define PURGE_SECS 5
484 /* Must be called under store_lock */
485 static void prune_initpid_store(void)
486 {
487 static long int last_prune = 0;
488 struct pidns_init_store *e, *prev, *delme;
489 long int now, threshold;
490 int i;
491
492 if (!last_prune) {
493 last_prune = time(NULL);
494 return;
495 }
496 now = time(NULL);
497 if (now < last_prune + PURGE_SECS)
498 return;
499
500 lxcfs_debug("%s\n", "Pruning.");
501
502 last_prune = now;
503 threshold = now - 2 * PURGE_SECS;
504
505 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
506 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
507 if (e->lastcheck < threshold) {
508
509 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
510
511 delme = e;
512 if (prev)
513 prev->next = e->next;
514 else
515 pidns_hash_table[i] = e->next;
516 e = e->next;
517 free(delme);
518 } else {
519 prev = e;
520 e = e->next;
521 }
522 }
523 }
524 }
525
526 /* Must be called under store_lock */
527 static void save_initpid(struct stat *sb, pid_t pid)
528 {
529 struct pidns_init_store *e;
530 char fpath[100];
531 struct stat procsb;
532 int h;
533
534 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
535
536 snprintf(fpath, 100, "/proc/%d", pid);
537 if (stat(fpath, &procsb) < 0)
538 return;
539 do {
540 e = malloc(sizeof(*e));
541 } while (!e);
542 e->ino = sb->st_ino;
543 e->initpid = pid;
544 e->ctime = procsb.st_ctime;
545 h = HASH(e->ino);
546 e->next = pidns_hash_table[h];
547 e->lastcheck = time(NULL);
548 pidns_hash_table[h] = e;
549 }
550
551 /*
552 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
553 * entry for the inode number and creation time. Verify that the init pid
554 * is still valid. If not, remove it. Return the entry if valid, NULL
555 * otherwise.
556 * Must be called under store_lock
557 */
558 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
559 {
560 int h = HASH(sb->st_ino);
561 struct pidns_init_store *e = pidns_hash_table[h];
562
563 while (e) {
564 if (e->ino == sb->st_ino) {
565 if (initpid_still_valid(e, sb)) {
566 e->lastcheck = time(NULL);
567 return e;
568 }
569 remove_initpid(e);
570 return NULL;
571 }
572 e = e->next;
573 }
574
575 return NULL;
576 }
577
578 static int is_dir(const char *path, int fd)
579 {
580 struct stat statbuf;
581 int ret = fstatat(fd, path, &statbuf, 0);
582 if (ret == 0 && S_ISDIR(statbuf.st_mode))
583 return 1;
584 return 0;
585 }
586
587 static int preserve_ns(const int pid, const char *ns)
588 {
589 int ret;
590 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
591 #define __NS_PATH_LEN 50
592 char path[__NS_PATH_LEN];
593
594 /* This way we can use this function to also check whether namespaces
595 * are supported by the kernel by passing in NULL or the empty
596 * string.
597 */
598 ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
599 !ns || strcmp(ns, "") == 0 ? "" : "/",
600 !ns || strcmp(ns, "") == 0 ? "" : ns);
601 if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
602 errno = EFBIG;
603 return -1;
604 }
605
606 return open(path, O_RDONLY | O_CLOEXEC);
607 }
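/*
 * Illustrative usage sketch (an assumption, not original code):
 *	int fd = preserve_ns(pid, "pid");	// opens /proc/<pid>/ns/pid
 *	if (fd < 0 && errno == ENOENT)
 *		;	// kernel does not support this namespace type
 *	else if (fd >= 0)
 *		close(fd);
 */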
608
609 /**
610 * in_same_namespace - Check whether two processes are in the same namespace.
611 * @pid1 - PID of the first process.
612 * @pid2 - PID of the second process.
613 * @ns - Name of the namespace to check. Must correspond to one of the names
614 * for the namespaces as shown in /proc/<pid>/ns/
615 *
616 * If the two processes are not in the same namespace, returns an fd to the
617 * namespace of the second process identified by @pid2. If they are in the
618 * same namespace, returns -EINVAL; returns -1 if an error occurred.
619 */
620 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
621 {
622 __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
623 int ret = -1;
624 struct stat ns_st1, ns_st2;
625
626 ns_fd1 = preserve_ns(pid1, ns);
627 if (ns_fd1 < 0) {
628 /* The kernel does not support this namespace. This is not an
629 * error.
630 */
631 if (errno == ENOENT)
632 return -EINVAL;
633
634 return -1;
635 }
636
637 ns_fd2 = preserve_ns(pid2, ns);
638 if (ns_fd2 < 0)
639 return -1;
640
641 ret = fstat(ns_fd1, &ns_st1);
642 if (ret < 0)
643 return -1;
644
645 ret = fstat(ns_fd2, &ns_st2);
646 if (ret < 0)
647 return -1;
648
649 /* processes are in the same namespace */
650 if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
651 return -EINVAL;
652
653 /* processes are in different namespaces */
654 return move_fd(ns_fd2);
655 }
656
657 static bool is_shared_pidns(pid_t pid)
658 {
659 if (pid != 1)
660 return false;
661
662 if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
663 return true;
664
665 return false;
666 }
667
668 static bool write_string(const char *fnam, const char *string, int fd)
669 {
670 FILE *f;
671 size_t len, ret;
672
673 f = fdopen(fd, "w");
674 if (!f)
675 return false;
676
677 len = strlen(string);
678 ret = fwrite(string, 1, len, f);
679 if (ret != len) {
680 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
681 strerror(errno), string, fnam);
682 fclose(f);
683 return false;
684 }
685
686 if (fclose(f) < 0) {
687 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
688 return false;
689 }
690
691 return true;
692 }
693
694 struct cgfs_files {
695 char *name;
696 uint32_t uid, gid;
697 uint32_t mode;
698 };
699
700 static void print_subsystems(void)
701 {
702 int i = 0;
703
704 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
705 fprintf(stderr, "hierarchies:\n");
706 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
707 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
708 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
709 }
710 }
711
712 /* do we need to do any massaging here? I'm not sure... */
713 /* Return the mounted controller and store the corresponding open file descriptor
714 * referring to the controller mountpoint in the private lxcfs namespace in
715 * @cfd.
716 */
717 static int find_mounted_controller(const char *controller)
718 {
719 struct hierarchy *h;
720
721 h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
722 return h ? h->fd : -EBADF;
723 }
724
725 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
726 const char *value)
727 {
728 int ret, fd, cfd;
729 size_t len;
730 char *fnam;
731
732 cfd = find_mounted_controller(controller);
733 if (cfd < 0)
734 return false;
735
736 /* Make sure we pass a relative path to *at() family of functions.
737 * . + /cgroup + / + file + \0
738 */
739 len = strlen(cgroup) + strlen(file) + 3;
740 fnam = alloca(len);
741 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
742 if (ret < 0 || (size_t)ret >= len)
743 return false;
744
745 fd = openat(cfd, fnam, O_WRONLY);
746 if (fd < 0)
747 return false;
748
749 return write_string(fnam, value, fd);
750 }
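/*
 * Illustrative usage sketch (an assumption; the controller, cgroup and
 * file names are hypothetical):
 *	if (!cgfs_set_value("freezer", "lxc/c1", "freezer.state", "FROZEN"))
 *		lxcfs_error("%s\n", "Failed to write freezer.state");
 */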
751
752 // Chown all the files in the cgroup directory. We do this when we create
753 // a cgroup on behalf of a user.
754 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
755 {
756 struct dirent *direntp;
757 char path[MAXPATHLEN];
758 size_t len;
759 DIR *d;
760 int fd1, ret;
761
762 len = strlen(dirname);
763 if (len >= MAXPATHLEN) {
764 lxcfs_error("Pathname too long: %s\n", dirname);
765 return;
766 }
767
768 fd1 = openat(fd, dirname, O_DIRECTORY);
769 if (fd1 < 0)
770 return;
771
772 d = fdopendir(fd1);
773 if (!d) {
774 lxcfs_error("Failed to open %s\n", dirname);
775 return;
776 }
777
778 while ((direntp = readdir(d))) {
779 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
780 continue;
781 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
782 if (ret < 0 || ret >= MAXPATHLEN) {
783 lxcfs_error("Pathname too long under %s\n", dirname);
784 continue;
785 }
786 if (fchownat(fd, path, uid, gid, 0) < 0)
787 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
788 }
789 closedir(d);
790 }
791
792 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
793 {
794 int cfd;
795 size_t len;
796 char *dirnam;
797
798 cfd = find_mounted_controller(controller);
799 if (cfd < 0)
800 return -EINVAL;
801
802 /* Make sure we pass a relative path to *at() family of functions.
803 * . + /cg + \0
804 */
805 len = strlen(cg) + 2;
806 dirnam = alloca(len);
807 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
808
809 if (mkdirat(cfd, dirnam, 0755) < 0)
810 return -errno;
811
812 if (uid == 0 && gid == 0)
813 return 0;
814
815 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
816 return -errno;
817
818 chown_all_cgroup_files(dirnam, uid, gid, cfd);
819
820 return 0;
821 }
822
823 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
824 {
825 struct dirent *direntp;
826 DIR *dir;
827 bool ret = false;
828 char pathname[MAXPATHLEN];
829 int dupfd;
830
831 dupfd = dup(fd); // fdopendir() takes ownership of the fd; closedir() closes it.
832 if (dupfd < 0)
833 return false;
834
835 dir = fdopendir(dupfd);
836 if (!dir) {
837 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
838 close(dupfd);
839 return false;
840 }
841
842 while ((direntp = readdir(dir))) {
843 struct stat mystat;
844 int rc;
845
846 if (!strcmp(direntp->d_name, ".") ||
847 !strcmp(direntp->d_name, ".."))
848 continue;
849
850 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
851 if (rc < 0 || rc >= MAXPATHLEN) {
852 lxcfs_error("%s\n", "Pathname too long.");
853 continue;
854 }
855
856 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
857 if (rc) {
858 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
859 continue;
860 }
861 if (S_ISDIR(mystat.st_mode))
862 if (!recursive_rmdir(pathname, fd, cfd))
863 lxcfs_debug("Error removing %s.\n", pathname);
864 }
865
866 ret = true;
867 if (closedir(dir) < 0) {
868 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
869 ret = false;
870 }
871
872 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
873 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
874 ret = false;
875 }
876
877 close(dupfd);
878
879 return ret;
880 }
881
882 bool cgfs_remove(const char *controller, const char *cg)
883 {
884 int fd, cfd;
885 size_t len;
886 char *dirnam;
887 bool bret;
888
889 cfd = find_mounted_controller(controller);
890 if (cfd < 0)
891 return false;
892
893 /* Make sure we pass a relative path to *at() family of functions.
894 * . + /cg + \0
895 */
896 len = strlen(cg) + 2;
897 dirnam = alloca(len);
898 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
899
900 fd = openat(cfd, dirnam, O_DIRECTORY);
901 if (fd < 0)
902 return false;
903
904 bret = recursive_rmdir(dirnam, fd, cfd);
905 close(fd);
906 return bret;
907 }
908
909 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
910 {
911 int cfd;
912 size_t len;
913 char *pathname;
914
915 cfd = find_mounted_controller(controller);
916 if (cfd < 0)
917 return false;
918
919 /* Make sure we pass a relative path to *at() family of functions.
920 * . + /file + \0
921 */
922 len = strlen(file) + 2;
923 pathname = alloca(len);
924 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
925 if (fchmodat(cfd, pathname, mode, 0) < 0)
926 return false;
927 return true;
928 }
929
930 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
931 {
932 size_t len;
933 char *fname;
934
935 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
936 fname = alloca(len);
937 snprintf(fname, len, "%s/tasks", dirname);
938 if (fchownat(fd, fname, uid, gid, 0) != 0)
939 return -errno;
940 snprintf(fname, len, "%s/cgroup.procs", dirname);
941 if (fchownat(fd, fname, uid, gid, 0) != 0)
942 return -errno;
943 return 0;
944 }
945
946 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
947 {
948 int cfd;
949 size_t len;
950 char *pathname;
951
952 cfd = find_mounted_controller(controller);
953 if (cfd < 0)
954 return -EINVAL;
955
956 /* Make sure we pass a relative path to *at() family of functions.
957 * . + /file + \0
958 */
959 len = strlen(file) + 2;
960 pathname = alloca(len);
961 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
962 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
963 return -errno;
964
965 if (is_dir(pathname, cfd))
966 // like cgmanager did, we want to chown the tasks file as well
967 return chown_tasks_files(pathname, uid, gid, cfd);
968
969 return 0;
970 }
971
972 FILE *open_pids_file(const char *controller, const char *cgroup)
973 {
974 int fd, cfd;
975 size_t len;
976 char *pathname;
977
978 cfd = find_mounted_controller(controller);
979 if (cfd < 0)
980 return NULL;
981
982 /* Make sure we pass a relative path to *at() family of functions.
983 * . + /cgroup + / "cgroup.procs" + \0
984 */
985 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
986 pathname = alloca(len);
987 snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
988
989 fd = openat(cfd, pathname, O_WRONLY);
990 if (fd < 0)
991 return NULL;
992
993 return fdopen(fd, "w");
994 }
995
996 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
997 void ***list, size_t typesize,
998 void* (*iterator)(const char*, const char*, const char*))
999 {
1000 int cfd, fd, ret;
1001 size_t len;
1002 char *cg;
1003 char pathname[MAXPATHLEN];
1004 size_t sz = 0, asz = 0;
1005 struct dirent *dirent;
1006 DIR *dir;
1007
1008 cfd = find_mounted_controller(controller);
1009 *list = NULL;
1010 if (cfd < 0)
1011 return false;
1012
1013 /* Make sure we pass a relative path to *at() family of functions. */
1014 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1015 cg = alloca(len);
1016 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1017 if (ret < 0 || (size_t)ret >= len) {
1018 lxcfs_error("Pathname too long under %s\n", cgroup);
1019 return false;
1020 }
1021
1022 fd = openat(cfd, cg, O_DIRECTORY);
1023 if (fd < 0)
1024 return false;
1025
1026 dir = fdopendir(fd);
1027 if (!dir)
1028 return false;
1029
1030 while ((dirent = readdir(dir))) {
1031 struct stat mystat;
1032
1033 if (!strcmp(dirent->d_name, ".") ||
1034 !strcmp(dirent->d_name, ".."))
1035 continue;
1036
1037 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1038 if (ret < 0 || ret >= MAXPATHLEN) {
1039 lxcfs_error("Pathname too long under %s\n", cg);
1040 continue;
1041 }
1042
1043 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1044 if (ret) {
1045 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1046 continue;
1047 }
1048 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1049 (directories && !S_ISDIR(mystat.st_mode)))
1050 continue;
1051
1052 if (sz+2 >= asz) {
1053 void **tmp;
1054 asz += BATCH_SIZE;
1055 do {
1056 tmp = realloc(*list, asz * typesize);
1057 } while (!tmp);
1058 *list = tmp;
1059 }
1060 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1061 (*list)[sz+1] = NULL;
1062 sz++;
1063 }
1064 if (closedir(dir) < 0) {
1065 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1066 return false;
1067 }
1068 return true;
1069 }
1070
1071 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1072 {
1073 char *dup;
1074 do {
1075 dup = strdup(dir_entry);
1076 } while (!dup);
1077 return dup;
1078 }
1079
1080 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1081 {
1082 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1083 }
1084
1085 void free_key(struct cgfs_files *k)
1086 {
1087 if (!k)
1088 return;
1089 free(k->name);
1090 free(k);
1091 }
1092
1093 void free_keys(struct cgfs_files **keys)
1094 {
1095 int i;
1096
1097 if (!keys)
1098 return;
1099 for (i = 0; keys[i]; i++) {
1100 free_key(keys[i]);
1101 }
1102 free(keys);
1103 }
1104
1105 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1106 {
1107 int ret, cfd;
1108 size_t len;
1109 char *fnam;
1110
1111 cfd = find_mounted_controller(controller);
1112 if (cfd < 0)
1113 return false;
1114
1115 /* Make sure we pass a relative path to *at() family of functions.
1116 * . + /cgroup + / + file + \0
1117 */
1118 len = strlen(cgroup) + strlen(file) + 3;
1119 fnam = alloca(len);
1120 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
1121 if (ret < 0 || (size_t)ret >= len)
1122 return false;
1123
1124 return (faccessat(cfd, fnam, F_OK, 0) == 0);
1125 }
1126
1127 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1128 {
1129 int ret, cfd;
1130 size_t len;
1131 char *fnam;
1132 struct stat sb;
1133 struct cgfs_files *newkey;
1134
1135 cfd = find_mounted_controller(controller);
1136 if (cfd < 0)
1137 return NULL;
1138
1139 if (file && *file == '/')
1140 file++;
1141
1142 if (file && strchr(file, '/'))
1143 return NULL;
1144
1145 /* Make sure we pass a relative path to *at() family of functions.
1146 * . + /cgroup + / + file + \0
1147 */
1148 len = strlen(cgroup) + 3;
1149 if (file)
1150 len += strlen(file) + 1;
1151 fnam = alloca(len);
1152 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1153 file ? "/" : "", file ? file : "");
1154
1155 ret = fstatat(cfd, fnam, &sb, 0);
1156 if (ret < 0)
1157 return NULL;
1158
1159 do {
1160 newkey = malloc(sizeof(struct cgfs_files));
1161 } while (!newkey);
1162 if (file)
1163 newkey->name = must_copy_string(file);
1164 else if (strrchr(cgroup, '/'))
1165 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1166 else
1167 newkey->name = must_copy_string(cgroup);
1168 newkey->uid = sb.st_uid;
1169 newkey->gid = sb.st_gid;
1170 newkey->mode = sb.st_mode;
1171
1172 return newkey;
1173 }
1174
1175 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1176 {
1177 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1178 if (!entry) {
1179 lxcfs_error("Error getting files under %s:%s\n", controller,
1180 cgroup);
1181 }
1182 return entry;
1183 }
1184
1185 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1186 {
1187 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1188 }
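/*
 * Illustrative usage sketch (an assumption; "memory" and "lxc/c1" are
 * hypothetical):
 *	struct cgfs_files **keys = NULL;
 *	if (cgfs_list_keys("memory", "lxc/c1", &keys)) {
 *		for (int i = 0; keys[i]; i++)
 *			printf("%s %u:%u %o\n", keys[i]->name,
 *			       keys[i]->uid, keys[i]->gid, keys[i]->mode);
 *		free_keys(keys);
 *	}
 */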
1189
1190 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1191 {
1192 int cfd;
1193 size_t len;
1194 char *fnam;
1195 int ret;
1196 struct stat sb;
1197
1198 cfd = find_mounted_controller(controller);
1199 if (cfd < 0)
1200 return false;
1201
1202 /* Make sure we pass a relative path to *at() family of functions.
1203 * . + /cgroup + / + f + \0
1204 */
1205 len = strlen(cgroup) + strlen(f) + 3;
1206 fnam = alloca(len);
1207 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
1208 if (ret < 0 || (size_t)ret >= len)
1209 return false;
1210
1211 ret = fstatat(cfd, fnam, &sb, 0);
1212 if (ret < 0 || !S_ISDIR(sb.st_mode))
1213 return false;
1214
1215 return true;
1216 }
1217
1218 #define SEND_CREDS_OK 0
1219 #define SEND_CREDS_NOTSK 1
1220 #define SEND_CREDS_FAIL 2
1221 static bool recv_creds(int sock, struct ucred *cred, char *v);
1222 static int wait_for_pid(pid_t pid);
1223 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1224 static int send_creds_clone_wrapper(void *arg);
1225
1226 /*
1227 * clone a task which switches to @task's namespace and writes '1'
1228 * over a unix sock so we can read the task's reaper's pid in our
1229 * namespace.
1230 *
1231 * Note: glibc's fork() does not respect pidns, which can lead to failed
1232 * assertions inside glibc (and thus failed forks) if the child's pid in
1233 * the pidns and the parent pid outside are identical. Using clone prevents
1234 * this issue.
1235 */
1236 static void write_task_init_pid_exit(int sock, pid_t target)
1237 {
1238 char fnam[100];
1239 pid_t pid;
1240 int fd, ret;
1241 size_t stack_size = sysconf(_SC_PAGESIZE);
1242 void *stack = alloca(stack_size);
1243
1244 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1245 if (ret < 0 || ret >= sizeof(fnam))
1246 _exit(1);
1247
1248 fd = open(fnam, O_RDONLY);
1249 if (fd < 0) {
1250 perror("write_task_init_pid_exit open of ns/pid");
1251 _exit(1);
1252 }
1253 if (setns(fd, 0)) {
1254 perror("write_task_init_pid_exit setns 1");
1255 close(fd);
1256 _exit(1);
1257 }
1258 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1259 if (pid < 0)
1260 _exit(1);
1261 if (pid != 0) {
1262 if (!wait_for_pid(pid))
1263 _exit(1);
1264 _exit(0);
1265 }
1266 }
1267
1268 static int send_creds_clone_wrapper(void *arg) {
1269 struct ucred cred;
1270 char v;
1271 int sock = *(int *)arg;
1272
1273 /* we are the child */
1274 cred.uid = 0;
1275 cred.gid = 0;
1276 cred.pid = 1;
1277 v = '1';
1278 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1279 return 1;
1280 return 0;
1281 }
1282
1283 static pid_t get_init_pid_for_task(pid_t task)
1284 {
1285 int sock[2];
1286 pid_t pid;
1287 pid_t ret = -1;
1288 char v = '0';
1289 struct ucred cred;
1290
1291 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1292 perror("socketpair");
1293 return -1;
1294 }
1295
1296 pid = fork();
1297 if (pid < 0)
1298 goto out;
1299 if (!pid) {
1300 close(sock[1]);
1301 write_task_init_pid_exit(sock[0], task);
1302 _exit(0);
1303 }
1304
1305 if (!recv_creds(sock[1], &cred, &v))
1306 goto out;
1307 ret = cred.pid;
1308
1309 out:
1310 close(sock[0]);
1311 close(sock[1]);
1312 if (pid > 0)
1313 wait_for_pid(pid);
1314 return ret;
1315 }
1316
1317 pid_t lookup_initpid_in_store(pid_t qpid)
1318 {
1319 pid_t answer = 0;
1320 struct stat sb;
1321 struct pidns_init_store *e;
1322 char fnam[100];
1323
1324 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1325 store_lock();
1326 if (stat(fnam, &sb) < 0)
1327 goto out;
1328 e = lookup_verify_initpid(&sb);
1329 if (e) {
1330 answer = e->initpid;
1331 goto out;
1332 }
1333 answer = get_init_pid_for_task(qpid);
1334 if (answer > 0)
1335 save_initpid(&sb, answer);
1336
1337 out:
1338 /* we prune at end in case we are returning
1339 * the value we were about to return */
1340 prune_initpid_store();
1341 store_unlock();
1342 return answer;
1343 }
1344
1345 static int wait_for_pid(pid_t pid)
1346 {
1347 int status, ret;
1348
1349 if (pid <= 0)
1350 return -1;
1351
1352 again:
1353 ret = waitpid(pid, &status, 0);
1354 if (ret == -1) {
1355 if (errno == EINTR)
1356 goto again;
1357 return -1;
1358 }
1359 if (ret != pid)
1360 goto again;
1361 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1362 return -1;
1363 return 0;
1364 }
1365
1366 /*
1367 * append the given formatted string to *src.
1368 * src: a pointer to a char* in which to append the formatted string.
1369 * sz: the number of characters printed so far, minus trailing \0.
1370 * asz: the allocated size so far
1371 * format: string format. See printf for details.
1372 * ...: varargs. See printf for details.
1373 */
1374 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1375 {
1376 char tmp[BUF_RESERVE_SIZE];
1377 va_list args;
1378
1379 va_start (args, format);
1380 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1381 va_end(args);
1382
1383 if (!*src || tmplen + *sz + 1 >= *asz) {
1384 char *str; /* do not shadow the tmp buffer above */
1385 do {
1386 str = realloc(*src, *asz + BUF_RESERVE_SIZE);
1387 } while (!str);
1388 *src = str;
1389 *asz += BUF_RESERVE_SIZE;
1390 }
1391 memcpy(*src + *sz, tmp, tmplen + 1); /* include the \0 */
1392 *sz += tmplen;
1393 }
1394
1395 /*
1396 * append pid to *src.
1397 * src: a pointer to a char* in which to append the pid.
1398 * sz: the number of characters printed so far, minus trailing \0.
1399 * asz: the allocated size so far
1400 * pid: the pid to append
1401 */
1402 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1403 {
1404 must_strcat(src, sz, asz, "%d\n", (int)pid);
1405 }
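/*
 * Illustrative usage sketch (an assumption, not original code): building
 * a newline-separated pid list the way the tasks/cgroup.procs readers do:
 *	char *buf = NULL;
 *	size_t sz = 0, asz = 0;
 *	must_strcat_pid(&buf, &sz, &asz, 1);
 *	must_strcat_pid(&buf, &sz, &asz, 42);
 *	// buf now holds "1\n42\n" and sz == 5
 *	free(buf);
 */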
1406
1407 /*
1408 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1409 * valid in the caller's namespace, return the id mapped into
1410 * pid's namespace.
1411 * Returns the mapped id, or -1 on error.
1412 */
1413 unsigned int
1414 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1415 {
1416 unsigned int nsuid, // base id for a range in the idfile's namespace
1417 hostuid, // base id for a range in the caller's namespace
1418 count; // number of ids in this range
1419 char line[400];
1420 int ret;
1421
1422 fseek(idfile, 0L, SEEK_SET);
1423 while (fgets(line, 400, idfile)) {
1424 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1425 if (ret != 3)
1426 continue;
1427 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1428 /*
1429 * uids wrapped around - unexpected as this is a procfile,
1430 * so just bail.
1431 */
1432 lxcfs_error("id wraparound at entry %u %u %u in %s\n",
1433 nsuid, hostuid, count, line);
1434 return -1;
1435 }
1436 if (hostuid <= in_id && hostuid+count > in_id) {
1437 /*
1438 * now since hostuid <= in_id < hostuid+count, and
1439 * hostuid+count and nsuid+count do not wrap around,
1440 * we know that nsuid+(in_id-hostuid), which must be
1441 * less than nsuid+count, must not wrap around
1442 */
1443 return (in_id - hostuid) + nsuid;
1444 }
1445 }
1446
1447 // no answer found
1448 return -1;
1449 }
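/*
 * Illustrative usage sketch (an assumption; pid 1234 and the map values
 * are hypothetical):
 *	FILE *f = fopen("/proc/1234/uid_map", "r");
 *	if (f) {
 *		unsigned int nsuid = convert_id_to_ns(f, 100000);
 *		// yields 0 under a map line "0 100000 65536"
 *		fclose(f);
 *	}
 */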
1450
1451 /*
1452 * for is_privileged_over,
1453 * specify whether we require the calling uid to be root in his
1454 * namespace
1455 */
1456 #define NS_ROOT_REQD true
1457 #define NS_ROOT_OPT false
1458
1459 #define PROCLEN 100
1460
1461 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1462 {
1463 char fpath[PROCLEN];
1464 int ret;
1465 bool answer = false;
1466 uid_t nsuid;
1467
1468 if (victim == -1 || uid == -1)
1469 return false;
1470
1471 /*
1472 * If the request is one not requiring root in the namespace,
1473 * then having the same uid suffices. (i.e. uid 1000 has write
1474 * access to files owned by uid 1000.)
1475 */
1476 if (!req_ns_root && uid == victim)
1477 return true;
1478
1479 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1480 if (ret < 0 || ret >= PROCLEN)
1481 return false;
1482 FILE *f = fopen(fpath, "r");
1483 if (!f)
1484 return false;
1485
1486 /* if caller's not root in his namespace, reject */
1487 nsuid = convert_id_to_ns(f, uid);
1488 if (nsuid)
1489 goto out;
1490
1491 /*
1492 * If victim is not mapped into caller's ns, reject.
1493 * XXX I'm not sure this check is needed given that fuse
1494 * will be sending requests where the vfs has converted
1495 */
1496 nsuid = convert_id_to_ns(f, victim);
1497 if (nsuid == -1)
1498 goto out;
1499
1500 answer = true;
1501
1502 out:
1503 fclose(f);
1504 return answer;
1505 }
1506
1507 static bool perms_include(int fmode, mode_t req_mode)
1508 {
1509 mode_t r;
1510
1511 switch (req_mode & O_ACCMODE) {
1512 case O_RDONLY:
1513 r = S_IROTH;
1514 break;
1515 case O_WRONLY:
1516 r = S_IWOTH;
1517 break;
1518 case O_RDWR:
1519 r = S_IROTH | S_IWOTH;
1520 break;
1521 default:
1522 return false;
1523 }
1524 return ((fmode & r) == r);
1525 }
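/*
 * Illustrative sketch (an assumption, not original code): fc_may_access()
 * below shifts the file mode so the relevant permission triplet lands in
 * the "other" bits checked here, e.g. for mode 0640:
 *	perms_include(0640 >> 6, O_RDONLY);	// owner bits rw- -> true
 *	perms_include(0640 >> 3, O_WRONLY);	// group bits r-- -> false
 */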
1526
1527
1528 /*
1529 * taskcg is /a/b/c/d/e
1530 * querycg is /a/b/c
1531 * we return 'd'
1532 */
1533 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1534 {
1535 char *start, *end;
1536
1537 if (strlen(taskcg) <= strlen(querycg)) {
1538 lxcfs_error("%s\n", "I was fed bad input.");
1539 return NULL;
1540 }
1541
1542 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1543 start = strdup(taskcg + 1);
1544 else
1545 start = strdup(taskcg + strlen(querycg) + 1);
1546 if (!start)
1547 return NULL;
1548 end = strchr(start, '/');
1549 if (end)
1550 *end = '\0';
1551 return start;
1552 }
1553
1554 char *get_pid_cgroup(pid_t pid, const char *contrl)
1555 {
1556 int cfd;
1557
1558 cfd = find_mounted_controller(contrl);
1559 if (cfd < 0)
1560 return NULL;
1561
1562 if (pure_unified_layout(cgroup_ops))
1563 return cg_unified_get_current_cgroup(pid);
1564
1565 return cg_legacy_get_current_cgroup(pid, contrl);
1566 }
1567
1568 /*
1569 * check whether a fuse context may access a cgroup dir or file
1570 *
1571 * If file is not null, it is a cgroup file to check under cg.
1572 * If file is null, then we are checking perms on cg itself.
1573 *
1574 * For files we can check the mode of the list_keys result.
1575 * For cgroups, we must make assumptions based on the files under the
1576 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1577 * yet.
1578 */
1579 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1580 {
1581 struct cgfs_files *k = NULL;
1582 bool ret = false;
1583
1584 k = cgfs_get_key(contrl, cg, file);
1585 if (!k)
1586 return false;
1587
1588 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1589 if (perms_include(k->mode >> 6, mode)) {
1590 ret = true;
1591 goto out;
1592 }
1593 }
1594 if (fc->gid == k->gid) {
1595 if (perms_include(k->mode >> 3, mode)) {
1596 ret = true;
1597 goto out;
1598 }
1599 }
1600 ret = perms_include(k->mode, mode);
1601
1602 out:
1603 free_key(k);
1604 return ret;
1605 }
1606
1607 #define INITSCOPE "/init.scope"
1608 void prune_init_slice(char *cg)
1609 {
1610 char *point;
1611 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1612
1613 if (cg_len < initscope_len)
1614 return;
1615
1616 point = cg + cg_len - initscope_len;
1617 if (strcmp(point, INITSCOPE) == 0) {
1618 if (point == cg)
1619 *(point+1) = '\0';
1620 else
1621 *point = '\0';
1622 }
1623 }
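/*
 * Illustrative sketch (an assumption, not original code):
 *	char cg1[] = "/user.slice/init.scope";
 *	prune_init_slice(cg1);	// cg1 becomes "/user.slice"
 *	char cg2[] = "/init.scope";
 *	prune_init_slice(cg2);	// cg2 becomes "/"
 */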
1624
1625 /*
1626 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1627 * If pid is in /a, he may act on /a/b, but not on /b.
1628 * if the answer is false and nextcg is not NULL, then *nextcg will point
1629 * to a string containing the next cgroup directory under cg, which must be
1630 * freed by the caller.
1631 */
1632 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1633 {
1634 bool answer = false;
1635 char *c2 = get_pid_cgroup(pid, contrl);
1636 char *linecmp;
1637
1638 if (!c2)
1639 return false;
1640 prune_init_slice(c2);
1641
1642 /*
1643 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1644 * they pass in a cgroup without leading '/'
1645 *
1646 * The original line here was:
1647 * linecmp = *cg == '/' ? c2 : c2+1;
1648 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1649 * Serge, do you know?
1650 */
1651 if (*cg == '/' || !strncmp(cg, "./", 2))
1652 linecmp = c2;
1653 else
1654 linecmp = c2 + 1;
1655 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1656 if (nextcg) {
1657 *nextcg = get_next_cgroup_dir(linecmp, cg);
1658 }
1659 goto out;
1660 }
1661 answer = true;
1662
1663 out:
1664 free(c2);
1665 return answer;
1666 }
1667
1668 /*
1669 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1670 */
1671 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1672 {
1673 bool answer = false;
1674 char *c2, *task_cg;
1675 size_t target_len, task_len;
1676
1677 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1678 return true;
1679
1680 c2 = get_pid_cgroup(pid, contrl);
1681 if (!c2)
1682 return false;
1683 prune_init_slice(c2);
1684
1685 task_cg = c2 + 1;
1686 target_len = strlen(cg);
1687 task_len = strlen(task_cg);
1688 if (task_len == 0) {
1689 /* Task is in the root cg, it can see everything. This case is
1690 * not handled by the strcmps below, since they test for the
1691 * last /, but that is the first / that we've chopped off
1692 * above.
1693 */
1694 answer = true;
1695 goto out;
1696 }
1697 if (strcmp(cg, task_cg) == 0) {
1698 answer = true;
1699 goto out;
1700 }
1701 if (target_len < task_len) {
1702 /* looking up a parent dir */
1703 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1704 answer = true;
1705 goto out;
1706 }
1707 if (target_len > task_len) {
1708 /* looking up a child dir */
1709 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1710 answer = true;
1711 goto out;
1712 }
1713
1714 out:
1715 free(c2);
1716 return answer;
1717 }
1718
1719 /*
1720 * given /cgroup/freezer/a/b, return "freezer".
1721 * the returned char* should NOT be freed.
1722 */
1723 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1724 {
1725 const char *p1;
1726 char *contr, *slash;
1727
1728 if (strlen(path) < 9) { /* shorter than "/cgroup/x" */
1729 errno = EACCES;
1730 return NULL;
1731 }
1732 if (*(path + 7) != '/') { /* path must begin with "/cgroup/" */
1733 errno = EINVAL;
1734 return NULL;
1735 }
1736 p1 = path + 8; /* skip the "/cgroup/" prefix */
1737 contr = strdupa(p1);
1738 if (!contr) {
1739 errno = ENOMEM;
1740 return NULL;
1741 }
1742 slash = strstr(contr, "/");
1743 if (slash)
1744 *slash = '\0';
1745
1746 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1747 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1748 return (*h)->__controllers;
1749 }
1750 errno = ENOENT;
1751 return NULL;
1752 }
1753
1754 /*
1755 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1756 * Note that the returned value may include files (keynames) etc
1757 */
1758 static const char *find_cgroup_in_path(const char *path)
1759 {
1760 const char *p1;
1761
1762 if (strlen(path) < 9) { /* shorter than "/cgroup/x" */
1763 errno = EACCES;
1764 return NULL;
1765 }
1766 p1 = strstr(path + 8, "/"); /* look past the "/cgroup/" prefix */
1767 if (!p1) {
1768 errno = EINVAL;
1769 return NULL;
1770 }
1771 errno = 0;
1772 return p1 + 1;
1773 }
1774
1775 /*
1776 * split the last path element from the path in @cg.
1777 * @dir is newly allocated and should be freed, @last not
1778 */
1779 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1780 {
1781 char *p;
1782
1783 do {
1784 *dir = strdup(cg);
1785 } while (!*dir);
1786 *last = strrchr(cg, '/');
1787 if (!*last) {
1788 *last = NULL;
1789 return;
1790 }
1791 p = strrchr(*dir, '/');
1792 *p = '\0';
1793 }
1794
1795 /*
1796 * FUSE ops for /cgroup
1797 */
1798
1799 int cg_getattr(const char *path, struct stat *sb)
1800 {
1801 struct timespec now;
1802 struct fuse_context *fc = fuse_get_context();
1803 char * cgdir = NULL;
1804 char *last = NULL, *path1, *path2;
1805 struct cgfs_files *k = NULL;
1806 const char *cgroup;
1807 const char *controller = NULL;
1808 int ret = -ENOENT;
1809
1810
1811 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1812 return -EIO;
1813
1814 memset(sb, 0, sizeof(struct stat));
1815
1816 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1817 return -EINVAL;
1818
1819 sb->st_uid = sb->st_gid = 0;
1820 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1821 sb->st_size = 0;
1822
1823 if (strcmp(path, "/cgroup") == 0) {
1824 sb->st_mode = S_IFDIR | 00755;
1825 sb->st_nlink = 2;
1826 return 0;
1827 }
1828
1829 controller = pick_controller_from_path(fc, path);
1830 if (!controller)
1831 return -errno;
1832 cgroup = find_cgroup_in_path(path);
1833 if (!cgroup) {
1834 /* this is just /cgroup/controller, return it as a dir */
1835 sb->st_mode = S_IFDIR | 00755;
1836 sb->st_nlink = 2;
1837 return 0;
1838 }
1839
1840 get_cgdir_and_path(cgroup, &cgdir, &last);
1841
1842 if (!last) {
1843 path1 = "/";
1844 path2 = cgdir;
1845 } else {
1846 path1 = cgdir;
1847 path2 = last;
1848 }
1849
1850 pid_t initpid = lookup_initpid_in_store(fc->pid);
1851 if (initpid <= 1 || is_shared_pidns(initpid))
1852 initpid = fc->pid;
1853 /* Check that 'last' is either a child cgroup of cgdir, or listed in its
1854 * keys. Then check that the caller's cgroup is under path1 if last is a
1855 * child cgroup, or under cgdir if last is a file. */
1856
1857 if (is_child_cgroup(controller, path1, path2)) {
1858 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1859 ret = -ENOENT;
1860 goto out;
1861 }
1862 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1863 /* caller is not in an ancestor cgroup; present it as a read-only dir */
1864 sb->st_mode = S_IFDIR | 00555;
1865 sb->st_nlink = 2;
1866 ret = 0;
1867 goto out;
1868 }
1869 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1870 ret = -EACCES;
1871 goto out;
1872 }
1873
1874 // get uid, gid, from '/tasks' file and make up a mode
1875 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1876 sb->st_mode = S_IFDIR | 00755;
1877 k = cgfs_get_key(controller, cgroup, NULL);
1878 if (!k) {
1879 sb->st_uid = sb->st_gid = 0;
1880 } else {
1881 sb->st_uid = k->uid;
1882 sb->st_gid = k->gid;
1883 }
1884 free_key(k);
1885 sb->st_nlink = 2;
1886 ret = 0;
1887 goto out;
1888 }
1889
1890 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1891 sb->st_mode = S_IFREG | k->mode;
1892 sb->st_nlink = 1;
1893 sb->st_uid = k->uid;
1894 sb->st_gid = k->gid;
1895 sb->st_size = 0;
1896 free_key(k);
1897 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1898 ret = -ENOENT;
1899 goto out;
1900 }
1901 ret = 0;
1902 }
1903
1904 out:
1905 free(cgdir);
1906 return ret;
1907 }
1908
1909 int cg_opendir(const char *path, struct fuse_file_info *fi)
1910 {
1911 struct fuse_context *fc = fuse_get_context();
1912 const char *cgroup;
1913 struct file_info *dir_info;
1914 char *controller = NULL;
1915
1916 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1917 return -EIO;
1918
1919 if (strcmp(path, "/cgroup") == 0) {
1920 cgroup = NULL;
1921 controller = NULL;
1922 } else {
1923 // return list of keys for the controller, and list of child cgroups
1924 controller = pick_controller_from_path(fc, path);
1925 if (!controller)
1926 return -errno;
1927
1928 cgroup = find_cgroup_in_path(path);
1929 if (!cgroup) {
1930 /* this is just /cgroup/controller, return its contents */
1931 cgroup = "/";
1932 }
1933 }
1934
1935 pid_t initpid = lookup_initpid_in_store(fc->pid);
1936 if (initpid <= 1 || is_shared_pidns(initpid))
1937 initpid = fc->pid;
1938 if (cgroup) {
1939 if (!caller_may_see_dir(initpid, controller, cgroup))
1940 return -ENOENT;
1941 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1942 return -EACCES;
1943 }
1944
1945 /* we'll free this at cg_releasedir */
1946 dir_info = malloc(sizeof(*dir_info));
1947 if (!dir_info)
1948 return -ENOMEM;
1949 dir_info->controller = must_copy_string(controller);
1950 dir_info->cgroup = must_copy_string(cgroup);
1951 dir_info->type = LXC_TYPE_CGDIR;
1952 dir_info->buf = NULL;
1953 dir_info->file = NULL;
1954 dir_info->buflen = 0;
1955
1956 fi->fh = (unsigned long)dir_info;
1957 return 0;
1958 }
1959
1960 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1961 struct fuse_file_info *fi)
1962 {
1963 struct file_info *d = (struct file_info *)fi->fh;
1964 struct cgfs_files **list = NULL;
1965 int i, ret;
1966 char *nextcg = NULL;
1967 struct fuse_context *fc = fuse_get_context();
1968 char **clist = NULL;
1969
1970 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1971 return -EIO;
1972
1973 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1974 return -EIO;
1975
1976 if (d->type != LXC_TYPE_CGDIR) {
1977 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1978 return -EIO;
1979 }
1980 if (!d->cgroup && !d->controller) {
1981 /*
1982 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1983 * This only works with the legacy hierarchy.
1984 */
1985 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1986 if (is_unified_hierarchy(*h))
1987 continue;
1988
1989 if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
1990 return -EIO;
1991 }
1992
1993 return 0;
1994 }
1995
1996 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1997 // not a valid cgroup
1998 ret = -EINVAL;
1999 goto out;
2000 }
2001
2002 pid_t initpid = lookup_initpid_in_store(fc->pid);
2003 if (initpid <= 1 || is_shared_pidns(initpid))
2004 initpid = fc->pid;
2005 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2006 if (nextcg) {
2007 ret = filler(buf, nextcg, NULL, 0);
2008 free(nextcg);
2009 if (ret != 0) {
2010 ret = -EIO;
2011 goto out;
2012 }
2013 }
2014 ret = 0;
2015 goto out;
2016 }
2017
2018 for (i = 0; list && list[i]; i++) {
2019 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2020 ret = -EIO;
2021 goto out;
2022 }
2023 }
2024
2025 // now get the list of child cgroups
2026
2027 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2028 ret = 0;
2029 goto out;
2030 }
2031 if (clist) {
2032 for (i = 0; clist[i]; i++) {
2033 if (filler(buf, clist[i], NULL, 0) != 0) {
2034 ret = -EIO;
2035 goto out;
2036 }
2037 }
2038 }
2039 ret = 0;
2040
2041 out:
2042 free_keys(list);
2043 if (clist) {
2044 for (i = 0; clist[i]; i++)
2045 free(clist[i]);
2046 free(clist);
2047 }
2048 return ret;
2049 }
2050
2051 void do_release_file_info(struct fuse_file_info *fi)
2052 {
2053 struct file_info *f = (struct file_info *)fi->fh;
2054
2055 if (!f)
2056 return;
2057
2058 fi->fh = 0;
2059
2060 free(f->controller);
2061 f->controller = NULL;
2062 free(f->cgroup);
2063 f->cgroup = NULL;
2064 free(f->file);
2065 f->file = NULL;
2066 free(f->buf);
2067 f->buf = NULL;
2068 free(f);
2069 f = NULL;
2070 }
2071
2072 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2073 {
2074 do_release_file_info(fi);
2075 return 0;
2076 }
2077
2078 int cg_open(const char *path, struct fuse_file_info *fi)
2079 {
2080 const char *cgroup;
2081 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2082 struct cgfs_files *k = NULL;
2083 struct file_info *file_info;
2084 struct fuse_context *fc = fuse_get_context();
2085 int ret;
2086
2087 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2088 return -EIO;
2089
2090 controller = pick_controller_from_path(fc, path);
2091 if (!controller)
2092 return -errno;
2093 cgroup = find_cgroup_in_path(path);
2094 if (!cgroup)
2095 return -errno;
2096
2097 get_cgdir_and_path(cgroup, &cgdir, &last);
2098 if (!last) {
2099 path1 = "/";
2100 path2 = cgdir;
2101 } else {
2102 path1 = cgdir;
2103 path2 = last;
2104 }
2105
2106 k = cgfs_get_key(controller, path1, path2);
2107 if (!k) {
2108 ret = -EINVAL;
2109 goto out;
2110 }
2111 free_key(k);
2112
2113 pid_t initpid = lookup_initpid_in_store(fc->pid);
2114 if (initpid <= 1 || is_shared_pidns(initpid))
2115 initpid = fc->pid;
2116 if (!caller_may_see_dir(initpid, controller, path1)) {
2117 ret = -ENOENT;
2118 goto out;
2119 }
2120 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2121 ret = -EACCES;
2122 goto out;
2123 }
2124
2125 /* we'll free this at cg_release */
2126 file_info = malloc(sizeof(*file_info));
2127 if (!file_info) {
2128 ret = -ENOMEM;
2129 goto out;
2130 }
2131 file_info->controller = must_copy_string(controller);
2132 file_info->cgroup = must_copy_string(path1);
2133 file_info->file = must_copy_string(path2);
2134 file_info->type = LXC_TYPE_CGFILE;
2135 file_info->buf = NULL;
2136 file_info->buflen = 0;
2137
2138 fi->fh = (unsigned long)file_info;
2139 ret = 0;
2140
2141 out:
2142 free(cgdir);
2143 return ret;
2144 }
2145
2146 int cg_access(const char *path, int mode)
2147 {
2148 int ret;
2149 const char *cgroup;
2150 char *path1, *path2, *controller;
2151 char *last = NULL, *cgdir = NULL;
2152 struct cgfs_files *k = NULL;
2153 struct fuse_context *fc = fuse_get_context();
2154
2155 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2156 return -EIO;
2157
2158 if (strcmp(path, "/cgroup") == 0)
2159 return 0;
2160
2161 controller = pick_controller_from_path(fc, path);
2162 if (!controller)
2163 return -errno;
2164 cgroup = find_cgroup_in_path(path);
2165 if (!cgroup) {
2166 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2167 if ((mode & W_OK) == 0)
2168 return 0;
2169 return -EACCES;
2170 }
2171
2172 get_cgdir_and_path(cgroup, &cgdir, &last);
2173 if (!last) {
2174 path1 = "/";
2175 path2 = cgdir;
2176 } else {
2177 path1 = cgdir;
2178 path2 = last;
2179 }
2180
2181 k = cgfs_get_key(controller, path1, path2);
2182 if (!k) {
2183 if ((mode & W_OK) == 0)
2184 ret = 0;
2185 else
2186 ret = -EACCES;
2187 goto out;
2188 }
2189 free_key(k);
2190
2191 pid_t initpid = lookup_initpid_in_store(fc->pid);
2192 if (initpid <= 1 || is_shared_pidns(initpid))
2193 initpid = fc->pid;
2194 if (!caller_may_see_dir(initpid, controller, path1)) {
2195 ret = -ENOENT;
2196 goto out;
2197 }
2198 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2199 ret = -EACCES;
2200 goto out;
2201 }
2202
2203 ret = 0;
2204
2205 out:
2206 free(cgdir);
2207 return ret;
2208 }
2209
2210 int cg_release(const char *path, struct fuse_file_info *fi)
2211 {
2212 do_release_file_info(fi);
2213 return 0;
2214 }
2215
2216 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2217
2218 static bool wait_for_sock(int sock, int timeout)
2219 {
2220 struct epoll_event ev;
2221 int epfd, ret, now, starttime, deltatime, saved_errno;
2222
2223 if ((starttime = time(NULL)) < 0)
2224 return false;
2225
2226 if ((epfd = epoll_create(1)) < 0) {
2227 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2228 return false;
2229 }
2230
2231 ev.events = POLLIN_SET;
2232 ev.data.fd = sock;
2233 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2234 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2235 close(epfd);
2236 return false;
2237 }
2238
2239 again:
2240 if ((now = time(NULL)) < 0) {
2241 close(epfd);
2242 return false;
2243 }
2244
2245 deltatime = (starttime + timeout) - now;
2246 if (deltatime < 0) { // timeout
2247 errno = 0;
2248 close(epfd);
2249 return false;
2250 }
2251 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2252 if (ret < 0 && errno == EINTR)
2253 goto again;
2254 saved_errno = errno;
2255 close(epfd);
2256
2257 if (ret <= 0) {
2258 errno = saved_errno;
2259 return false;
2260 }
2261 return true;
2262 }
2263
2264 static int msgrecv(int sockfd, void *buf, size_t len)
2265 {
2266 if (!wait_for_sock(sockfd, 2))
2267 return -1;
2268 return recv(sockfd, buf, len, MSG_DONTWAIT);
2269 }
2270
2271 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2272 {
2273 struct msghdr msg = { 0 };
2274 struct iovec iov;
2275 struct cmsghdr *cmsg;
2276 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2277 char buf[1];
2278 buf[0] = 'p';
2279
2280 if (pingfirst) {
2281 if (msgrecv(sock, buf, 1) != 1) {
2282 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2283 return SEND_CREDS_FAIL;
2284 }
2285 }
2286
2287 msg.msg_control = cmsgbuf;
2288 msg.msg_controllen = sizeof(cmsgbuf);
2289
2290 cmsg = CMSG_FIRSTHDR(&msg);
2291 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2292 cmsg->cmsg_level = SOL_SOCKET;
2293 cmsg->cmsg_type = SCM_CREDENTIALS;
2294 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2295
2296 msg.msg_name = NULL;
2297 msg.msg_namelen = 0;
2298
2299 buf[0] = v;
2300 iov.iov_base = buf;
2301 iov.iov_len = sizeof(buf);
2302 msg.msg_iov = &iov;
2303 msg.msg_iovlen = 1;
2304
2305 if (sendmsg(sock, &msg, 0) < 0) {
2306 lxcfs_error("Failed at sendmsg: %s.\n", strerror(errno));
2307 if (errno == ESRCH)
2308 return SEND_CREDS_NOTSK;
2309 return SEND_CREDS_FAIL;
2310 }
2311
2312 return SEND_CREDS_OK;
2313 }
2314
2315 static bool recv_creds(int sock, struct ucred *cred, char *v)
2316 {
2317 struct msghdr msg = { 0 };
2318 struct iovec iov;
2319 struct cmsghdr *cmsg;
2320 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2321 char buf[1];
2322 int ret;
2323 int optval = 1;
2324
2325 *v = '1';
2326
2327 cred->pid = -1;
2328 cred->uid = -1;
2329 cred->gid = -1;
2330
2331 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2332 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2333 return false;
2334 }
2335 buf[0] = '1';
2336 if (write(sock, buf, 1) != 1) {
2337 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2338 return false;
2339 }
2340
2341 msg.msg_name = NULL;
2342 msg.msg_namelen = 0;
2343 msg.msg_control = cmsgbuf;
2344 msg.msg_controllen = sizeof(cmsgbuf);
2345
2346 iov.iov_base = buf;
2347 iov.iov_len = sizeof(buf);
2348 msg.msg_iov = &iov;
2349 msg.msg_iovlen = 1;
2350
2351 if (!wait_for_sock(sock, 2)) {
2352 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2353 return false;
2354 }
2355 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2356 if (ret < 0) {
2357 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2358 return false;
2359 }
2360
2361 cmsg = CMSG_FIRSTHDR(&msg);
2362
2363 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2364 cmsg->cmsg_level == SOL_SOCKET &&
2365 cmsg->cmsg_type == SCM_CREDENTIALS) {
2366 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2367 }
2368 *v = buf[0];
2369
2370 return true;
2371 }
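
/*
 * A minimal sketch (not part of lxcfs; creds_demo is a made-up name) of the
 * handshake send_creds()/recv_creds() implement: with SO_PASSCRED enabled,
 * the kernel rewrites the pid carried in an SCM_CREDENTIALS message into the
 * receiver's pid namespace, which is the translation primitive everything
 * below builds on.
 */
#if 0
static void creds_demo(void)
{
int sk[2];
struct ucred cred;
char v;
pid_t pid;

if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sk) < 0)
return;

pid = fork();
if (pid == 0) { /* child: the receiving side */
close(sk[0]);
if (recv_creds(sk[1], &cred, &v))
/* cred.pid is the parent's pid as seen from the child's pid
 * namespace (identical here, but translated once the child
 * has setns()'d into another pidns). */
fprintf(stderr, "saw pid %d\n", cred.pid);
_exit(0);
}

close(sk[1]);
cred.pid = getpid();
cred.uid = getuid();
cred.gid = getgid();
/* pingfirst=true: block until recv_creds() writes its start byte. */
send_creds(sk[0], &cred, '0', true);
close(sk[0]);
wait_for_pid(pid);
}
#endif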
2372
2373 struct pid_ns_clone_args {
2374 int *cpipe;
2375 int sock;
2376 pid_t tpid;
2377 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2378 };
2379
2380 /*
2381 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2382 * with clone(). This simply writes '1' as ACK back to the parent
2383 * before calling the actual wrapped function.
2384 */
2385 static int pid_ns_clone_wrapper(void *arg) {
2386 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2387 char b = '1';
2388
2389 close(args->cpipe[0]);
2390 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2391 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2392 close(args->cpipe[1]);
2393 return args->wrapped(args->sock, args->tpid);
2394 }
2395
2396 /*
2397 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2398 * int value back over the socket. This shifts the pid from the
2399 * sender's pidns into tpid's pidns.
2400 */
2401 static int pid_to_ns(int sock, pid_t tpid)
2402 {
2403 char v = '0';
2404 struct ucred cred;
2405
2406 while (recv_creds(sock, &cred, &v)) {
2407 if (v == '1')
2408 return 0;
2409 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2410 return 1;
2411 }
2412 return 0;
2413 }
2414
2415
2416 /*
2417 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2418 * in your old pidns. Only children which you clone will be in the target
2419 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2420 * actually convert pids.
2421 *
2422 * Note: glibc's fork() does not respect pidns, which can lead to failed
2423 * assertions inside glibc (and thus failed forks) if the child's pid in
2424 * the pidns and the parent pid outside are identical. Using clone prevents
2425 * this issue.
2426 */
2427 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2428 {
2429 int newnsfd = -1, ret, cpipe[2];
2430 char fnam[100];
2431 pid_t cpid;
2432 char v;
2433
2434 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2435 if (ret < 0 || ret >= sizeof(fnam))
2436 _exit(1);
2437 newnsfd = open(fnam, O_RDONLY);
2438 if (newnsfd < 0)
2439 _exit(1);
2440 if (setns(newnsfd, 0) < 0)
2441 _exit(1);
2442 close(newnsfd);
2443
2444 if (pipe(cpipe) < 0)
2445 _exit(1);
2446
2447 struct pid_ns_clone_args args = {
2448 .cpipe = cpipe,
2449 .sock = sock,
2450 .tpid = tpid,
2451 .wrapped = &pid_to_ns
2452 };
2453 size_t stack_size = sysconf(_SC_PAGESIZE);
2454 void *stack = alloca(stack_size);
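/* clone() expects the highest address of the child's stack, since stacks
 * grow downward on virtually all architectures Linux supports; the page
 * itself lives in this frame via alloca() and stays valid until the child
 * has been waited on below. */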
2455
2456 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2457 if (cpid < 0)
2458 _exit(1);
2459
2460 // give the child 1 second to be done forking and
2461 // write its ack
2462 if (!wait_for_sock(cpipe[0], 1))
2463 _exit(1);
2464 ret = read(cpipe[0], &v, 1);
2465 if (ret != sizeof(char) || v != '1')
2466 _exit(1);
2467
2468 if (!wait_for_pid(cpid))
2469 _exit(1);
2470 _exit(0);
2471 }
2472
2473 /*
2474 * To read cgroup files with a particular pid, we will setns into the child
2475 * pidns, open a pipe, fork a child - which will be the first to really be in
2476 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2477 */
2478 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2479 {
2480 int sock[2] = {-1, -1};
2481 char *tmpdata = NULL;
2482 int ret;
2483 pid_t qpid, cpid = -1;
2484 bool answer = false;
2485 char v = '0';
2486 struct ucred cred;
2487 size_t sz = 0, asz = 0;
2488
2489 if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
2490 return false;
2491
2492 /*
2493 * Now we read the pids from returned data one by one, pass
2494 * them into a child in the target namespace, read back the
2495 * translated pids, and put them into our to-return data
2496 */
2497
2498 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2499 perror("socketpair");
2500 free(tmpdata);
2501 return false;
2502 }
2503
2504 cpid = fork();
2505 if (cpid == -1)
2506 goto out;
2507
2508 if (!cpid) // child - exits when done
2509 pid_to_ns_wrapper(sock[1], tpid);
2510
2511 char *ptr = tmpdata;
2512 cred.uid = 0;
2513 cred.gid = 0;
2514 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2515 cred.pid = qpid;
2516 ret = send_creds(sock[0], &cred, v, true);
2517
2518 if (ret == SEND_CREDS_NOTSK)
2519 goto next;
2520 if (ret == SEND_CREDS_FAIL)
2521 goto out;
2522
2523 // read converted results
2524 if (!wait_for_sock(sock[0], 2)) {
2525 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2526 goto out;
2527 }
2528 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2529 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2530 goto out;
2531 }
2532 must_strcat_pid(d, &sz, &asz, qpid);
2533 next:
2534 ptr = strchr(ptr, '\n');
2535 if (!ptr)
2536 break;
2537 ptr++;
2538 }
2539
2540 cred.pid = getpid();
2541 v = '1';
2542 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2543 // failed to ask child to exit
2544 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2545 goto out;
2546 }
2547
2548 answer = true;
2549
2550 out:
2551 free(tmpdata);
2552 if (cpid != -1)
2553 wait_for_pid(cpid);
2554 if (sock[0] != -1) {
2555 close(sock[0]);
2556 close(sock[1]);
2557 }
2558 return answer;
2559 }
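
/*
 * Sketch of the exchange implemented above: the child's recv_creds() writes
 * a start byte, the parent's send_creds() consumes it and answers with an
 * SCM_CREDENTIALS message carrying one pid and v == '0'; pid_to_ns() in the
 * child then writes the kernel-translated pid_t back over the socket. A
 * final message with v == '1' tells the child to exit.
 */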
2560
2561 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2562 struct fuse_file_info *fi)
2563 {
2564 struct fuse_context *fc = fuse_get_context();
2565 struct file_info *f = (struct file_info *)fi->fh;
2566 struct cgfs_files *k = NULL;
2567 char *data = NULL;
2568 int ret, s;
2569 bool r;
2570
2571 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2572 return -EIO;
2573
2574 if (f->type != LXC_TYPE_CGFILE) {
2575 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2576 return -EIO;
2577 }
2578
2579 if (offset)
2580 return 0;
2581
2582 if (!f->controller)
2583 return -EINVAL;
2584
2585 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2586 return -EINVAL;
2587 }
2588 free_key(k);
2589
2590
2591 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2592 ret = -EACCES;
2593 goto out;
2594 }
2595
2596 if (strcmp(f->file, "tasks") == 0 ||
2597 strcmp(f->file, "/tasks") == 0 ||
2598 strcmp(f->file, "/cgroup.procs") == 0 ||
2599 strcmp(f->file, "cgroup.procs") == 0)
2600 // special case - we have to translate the pids
2601 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2602 else
2603 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
2604
2605 if (!r) {
2606 ret = -EINVAL;
2607 goto out;
2608 }
2609
2610 if (!data) {
2611 ret = 0;
2612 goto out;
2613 }
2614 s = strlen(data);
2615 if (s > size)
2616 s = size;
2617 memcpy(buf, data, s);
2618 if (s > 0 && s < size && data[s-1] != '\n')
2619 buf[s++] = '\n';
2620
2621 ret = s;
2622
2623 out:
2624 free(data);
2625 return ret;
2626 }
2627
2628 static int pid_from_ns(int sock, pid_t tpid)
2629 {
2630 pid_t vpid;
2631 struct ucred cred;
2632 char v;
2633 int ret;
2634
2635 cred.uid = 0;
2636 cred.gid = 0;
2637 while (1) {
2638 if (!wait_for_sock(sock, 2)) {
2639 lxcfs_error("%s\n", "Timeout reading from parent.");
2640 return 1;
2641 }
2642 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2643 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2644 return 1;
2645 }
2646 if (vpid == -1) // done
2647 break;
2648 v = '0';
2649 cred.pid = vpid;
2650 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2651 v = '1';
2652 cred.pid = getpid();
2653 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2654 return 1;
2655 }
2656 }
2657 return 0;
2658 }
2659
2660 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2661 {
2662 int newnsfd = -1, ret, cpipe[2];
2663 char fnam[100];
2664 pid_t cpid;
2665 char v;
2666
2667 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2668 if (ret < 0 || ret >= sizeof(fnam))
2669 _exit(1);
2670 newnsfd = open(fnam, O_RDONLY);
2671 if (newnsfd < 0)
2672 _exit(1);
2673 if (setns(newnsfd, 0) < 0)
2674 _exit(1);
2675 close(newnsfd);
2676
2677 if (pipe(cpipe) < 0)
2678 _exit(1);
2679
2680 struct pid_ns_clone_args args = {
2681 .cpipe = cpipe,
2682 .sock = sock,
2683 .tpid = tpid,
2684 .wrapped = &pid_from_ns
2685 };
2686 size_t stack_size = sysconf(_SC_PAGESIZE);
2687 void *stack = alloca(stack_size);
2688
2689 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2690 if (cpid < 0)
2691 _exit(1);
2692
2693 // give the child 1 second to be done forking and
2694 // write its ack
2695 if (!wait_for_sock(cpipe[0], 1))
2696 _exit(1);
2697 ret = read(cpipe[0], &v, 1);
2698 if (ret != sizeof(char) || v != '1')
2699 _exit(1);
2700
2701 if (!wait_for_pid(cpid))
2702 _exit(1);
2703 _exit(0);
2704 }
2705
2706 /*
2707 * Given host @uid, return the uid to which it maps in
2708 * @pid's user namespace, or -1 if none.
2709 */
2710 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2711 {
2712 FILE *f;
2713 char line[400];
2714
2715 sprintf(line, "/proc/%d/uid_map", pid);
2716 if ((f = fopen(line, "r")) == NULL) {
2717 return false;
2718 }
2719
2720 *answer = convert_id_to_ns(f, uid);
2721 fclose(f);
2722
2723 if (*answer == -1)
2724 return false;
2725 return true;
2726 }
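
/*
 * convert_id_to_ns() is defined elsewhere in lxcfs. A minimal sketch (the
 * _sketch name and this exact shape are illustrative, not the shipped
 * implementation) of the lookup it performs over the "ns-id host-id range"
 * triples in /proc/<pid>/uid_map:
 */
#if 0
static uid_t convert_id_to_ns_sketch(FILE *idfile, uid_t in_id)
{
unsigned int ns, host, range;

while (fscanf(idfile, "%u %u %u", &ns, &host, &range) == 3)
if (host <= in_id && in_id < host + range)
return ns + (in_id - host);

return (uid_t)-1; /* @in_id is not mapped into the target namespace */
}
#endif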
2727
2728 /*
2729 * get_pid_creds: get the real uid and gid of @pid from
2730 * /proc/<pid>/status
2731 * (XXX should we use euid here?)
2732 */
2733 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2734 {
2735 char line[400];
2736 uid_t u;
2737 gid_t g;
2738 FILE *f;
2739
2740 *uid = -1;
2741 *gid = -1;
2742 sprintf(line, "/proc/%d/status", pid);
2743 if ((f = fopen(line, "r")) == NULL) {
2744 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2745 return;
2746 }
2747 while (fgets(line, 400, f)) {
2748 if (strncmp(line, "Uid:", 4) == 0) {
2749 if (sscanf(line+4, "%u", &u) != 1) {
2750 lxcfs_error("bad uid line for pid %u\n", pid);
2751 fclose(f);
2752 return;
2753 }
2754 *uid = u;
2755 } else if (strncmp(line, "Gid:", 4) == 0) {
2756 if (sscanf(line+4, "%u", &g) != 1) {
2757 lxcfs_error("bad gid line for pid %u\n", pid);
2758 fclose(f);
2759 return;
2760 }
2761 *gid = g;
2762 }
2763 }
2764 fclose(f);
2765 }
2766
2767 /*
2768 * May the requestor @r move victim @v to a new cgroup?
2769 * This is allowed if
2770 * . they are the same task
2771 * . they are owned by the same uid
2772 * . @r is root on the host, or
2773 * . @v's uid is mapped into @r's where @r is root.
2774 */
2775 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2776 {
2777 uid_t v_uid, tmpuid;
2778 gid_t v_gid;
2779
2780 if (r == v)
2781 return true;
2782 if (r_uid == 0)
2783 return true;
2784 get_pid_creds(v, &v_uid, &v_gid);
2785 if (r_uid == v_uid)
2786 return true;
2787 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2788 && hostuid_to_ns(v_uid, r, &tmpuid))
2789 return true;
2790 return false;
2791 }
2792
2793 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2794 const char *file, const char *buf)
2795 {
2796 int sock[2] = {-1, -1};
2797 pid_t qpid, cpid = -1;
2798 FILE *pids_file = NULL;
2799 bool answer = false, fail = false;
2800
2801 pids_file = open_pids_file(contrl, cg);
2802 if (!pids_file)
2803 return false;
2804
2805 /*
2806 * write the pids to a socket, have helper in writer's pidns
2807 * call movepid for us
2808 */
2809 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2810 perror("socketpair");
2811 goto out;
2812 }
2813
2814 cpid = fork();
2815 if (cpid == -1)
2816 goto out;
2817
2818 if (!cpid) { // child
2819 fclose(pids_file);
2820 pid_from_ns_wrapper(sock[1], tpid);
2821 }
2822
2823 const char *ptr = buf;
2824 while (sscanf(ptr, "%d", &qpid) == 1) {
2825 struct ucred cred;
2826 char v;
2827
2828 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2829 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2830 goto out;
2831 }
2832
2833 if (recv_creds(sock[0], &cred, &v)) {
2834 if (v == '0') {
2835 if (!may_move_pid(tpid, tuid, cred.pid)) {
2836 fail = true;
2837 break;
2838 }
2839 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2840 fail = true;
2841 }
2842 }
2843
2844 ptr = strchr(ptr, '\n');
2845 if (!ptr)
2846 break;
2847 ptr++;
2848 }
2849
2850 /* Finished; send the -1 sentinel telling the child to exit. */
2851 qpid = -1;
2852 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid))
2853 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2854
2855 if (!fail)
2856 answer = true;
2857
2858 out:
2859 if (cpid != -1)
2860 wait_for_pid(cpid);
2861 if (sock[0] != -1) {
2862 close(sock[0]);
2863 close(sock[1]);
2864 }
2865 if (pids_file) {
2866 if (fclose(pids_file) != 0)
2867 answer = false;
2868 }
2869 return answer;
2870 }
2871
2872 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2873 struct fuse_file_info *fi)
2874 {
2875 struct fuse_context *fc = fuse_get_context();
2876 char *localbuf = NULL;
2877 struct cgfs_files *k = NULL;
2878 struct file_info *f = (struct file_info *)fi->fh;
2879 bool r;
2880
2881 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2882 return -EIO;
2883
2884 if (f->type != LXC_TYPE_CGFILE) {
2885 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2886 return -EIO;
2887 }
2888
2889 if (offset)
2890 return 0;
2891
2892 localbuf = alloca(size+1);
2893 localbuf[size] = '\0';
2894 memcpy(localbuf, buf, size);
2895
2896 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2897 size = -EINVAL;
2898 goto out;
2899 }
2900
2901 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2902 size = -EACCES;
2903 goto out;
2904 }
2905
2906 if (strcmp(f->file, "tasks") == 0 ||
2907 strcmp(f->file, "/tasks") == 0 ||
2908 strcmp(f->file, "/cgroup.procs") == 0 ||
2909 strcmp(f->file, "cgroup.procs") == 0)
2910 // special case - we have to translate the pids
2911 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2912 else
2913 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2914
2915 if (!r)
2916 size = -EINVAL;
2917
2918 out:
2919 free_key(k);
2920 return size;
2921 }
2922
2923 int cg_chown(const char *path, uid_t uid, gid_t gid)
2924 {
2925 struct fuse_context *fc = fuse_get_context();
2926 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2927 struct cgfs_files *k = NULL;
2928 const char *cgroup;
2929 int ret;
2930
2931 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2932 return -EIO;
2933
2934 if (strcmp(path, "/cgroup") == 0)
2935 return -EPERM;
2936
2937 controller = pick_controller_from_path(fc, path);
2938 if (!controller)
2939 return errno == ENOENT ? -EPERM : -errno;
2940
2941 cgroup = find_cgroup_in_path(path);
2942 if (!cgroup)
2943 /* this is just /cgroup/controller */
2944 return -EPERM;
2945
2946 get_cgdir_and_path(cgroup, &cgdir, &last);
2947
2948 if (!last) {
2949 path1 = "/";
2950 path2 = cgdir;
2951 } else {
2952 path1 = cgdir;
2953 path2 = last;
2954 }
2955
2956 if (is_child_cgroup(controller, path1, path2)) {
2957 // get uid, gid, from '/tasks' file and make up a mode
2958 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2959 k = cgfs_get_key(controller, cgroup, "tasks");
2960
2961 } else
2962 k = cgfs_get_key(controller, path1, path2);
2963
2964 if (!k) {
2965 ret = -EINVAL;
2966 goto out;
2967 }
2968
2969 /*
2970 * This being a fuse request, the uid and gid must be valid
2971 * in the caller's namespace. So we can just check to make
2972 * sure that the caller is root in his uid, and privileged
2973 * over the file's current owner.
2974 */
2975 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2976 ret = -EACCES;
2977 goto out;
2978 }
2979
2980 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2981
2982 out:
2983 free_key(k);
2984 free(cgdir);
2985
2986 return ret;
2987 }
2988
2989 int cg_chmod(const char *path, mode_t mode)
2990 {
2991 struct fuse_context *fc = fuse_get_context();
2992 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2993 struct cgfs_files *k = NULL;
2994 const char *cgroup;
2995 int ret;
2996
2997 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2998 return -EIO;
2999
3000 if (strcmp(path, "/cgroup") == 0)
3001 return -EPERM;
3002
3003 controller = pick_controller_from_path(fc, path);
3004 if (!controller)
3005 return errno == ENOENT ? -EPERM : -errno;
3006
3007 cgroup = find_cgroup_in_path(path);
3008 if (!cgroup)
3009 /* this is just /cgroup/controller */
3010 return -EPERM;
3011
3012 get_cgdir_and_path(cgroup, &cgdir, &last);
3013
3014 if (!last) {
3015 path1 = "/";
3016 path2 = cgdir;
3017 } else {
3018 path1 = cgdir;
3019 path2 = last;
3020 }
3021
3022 if (is_child_cgroup(controller, path1, path2)) {
3023 // get uid, gid, from '/tasks' file and make up a mode
3024 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3025 k = cgfs_get_key(controller, cgroup, "tasks");
3026
3027 } else
3028 k = cgfs_get_key(controller, path1, path2);
3029
3030 if (!k) {
3031 ret = -EINVAL;
3032 goto out;
3033 }
3034
3035 /*
3036 * This being a fuse request, the uid and gid must be valid
3037 * in the caller's namespace. So we can just check to make
3038 * sure that the caller is root in his uid, and privileged
3039 * over the file's current owner.
3040 */
3041 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3042 ret = -EPERM;
3043 goto out;
3044 }
3045
3046 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3047 ret = -EINVAL;
3048 goto out;
3049 }
3050
3051 ret = 0;
3052 out:
3053 free_key(k);
3054 free(cgdir);
3055 return ret;
3056 }
3057
3058 int cg_mkdir(const char *path, mode_t mode)
3059 {
3060 struct fuse_context *fc = fuse_get_context();
3061 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3062 const char *cgroup;
3063 int ret;
3064
3065 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3066 return -EIO;
3067
3068 controller = pick_controller_from_path(fc, path);
3069 if (!controller)
3070 return errno == ENOENT ? -EPERM : -errno;
3071
3072 cgroup = find_cgroup_in_path(path);
3073 if (!cgroup)
3074 return -errno;
3075
3076 get_cgdir_and_path(cgroup, &cgdir, &last);
3077 if (!last)
3078 path1 = "/";
3079 else
3080 path1 = cgdir;
3081
3082 pid_t initpid = lookup_initpid_in_store(fc->pid);
3083 if (initpid <= 1 || is_shared_pidns(initpid))
3084 initpid = fc->pid;
3085 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3086 if (!next)
3087 ret = -EINVAL;
3088 else if (last && strcmp(next, last) == 0)
3089 ret = -EEXIST;
3090 else
3091 ret = -EPERM;
3092 goto out;
3093 }
3094
3095 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3096 ret = -EACCES;
3097 goto out;
3098 }
3099 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3100 ret = -EACCES;
3101 goto out;
3102 }
3103
3104 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3105
3106 out:
3107 free(cgdir);
3108 free(next);
3109 return ret;
3110 }
3111
3112 int cg_rmdir(const char *path)
3113 {
3114 struct fuse_context *fc = fuse_get_context();
3115 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3116 const char *cgroup;
3117 int ret;
3118
3119 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3120 return -EIO;
3121
3122 controller = pick_controller_from_path(fc, path);
3123 if (!controller) /* Someone's trying to delete "/cgroup". */
3124 return -EPERM;
3125
3126 cgroup = find_cgroup_in_path(path);
3127 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3128 return -EPERM;
3129
3130 get_cgdir_and_path(cgroup, &cgdir, &last);
3131 if (!last) {
3132 /* Someone's trying to delete a cgroup on the same level as the
3133 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3134 * rmdir "/cgroup/blkio/init.slice".
3135 */
3136 ret = -EPERM;
3137 goto out;
3138 }
3139
3140 pid_t initpid = lookup_initpid_in_store(fc->pid);
3141 if (initpid <= 1 || is_shared_pidns(initpid))
3142 initpid = fc->pid;
3143 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3144 if (!last || (next && (strcmp(next, last) == 0)))
3145 ret = -EBUSY;
3146 else
3147 ret = -ENOENT;
3148 goto out;
3149 }
3150
3151 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3152 ret = -EACCES;
3153 goto out;
3154 }
3155 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3156 ret = -EACCES;
3157 goto out;
3158 }
3159
3160 if (!cgfs_remove(controller, cgroup)) {
3161 ret = -EINVAL;
3162 goto out;
3163 }
3164
3165 ret = 0;
3166
3167 out:
3168 free(cgdir);
3169 free(next);
3170 return ret;
3171 }
3172
3173 static bool startswith(const char *line, const char *pref)
3174 {
3175 if (strncmp(line, pref, strlen(pref)) == 0)
3176 return true;
3177 return false;
3178 }
3179
3180 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
3181 static void parse_memstat(int version,
3182 char *memstat,
3183 unsigned long *cached,
3184 unsigned long *active_anon,
3185 unsigned long *inactive_anon,
3186 unsigned long *active_file,
3187 unsigned long *inactive_file,
3188 unsigned long *unevictable,
3189 unsigned long *shmem)
3190 {
3191 char *eol;
3192
3193 while (*memstat) {
3194 if (startswith(memstat, is_unified_controller(version)
3195 ? "cache"
3196 : "total_cache")) {
3197 sscanf(memstat + (is_unified_controller(version) ? 5 : 11), "%lu", cached);
3198 *cached /= 1024;
3199 } else if (startswith(memstat, is_unified_controller(version)
3200 ? "active_anon"
3201 : "total_active_anon")) {
3202 sscanf(memstat + (is_unified_controller(version) ? 11 : 17), "%lu", active_anon);
3203 *active_anon /= 1024;
3204 } else if (startswith(memstat, is_unified_controller(version)
3205 ? "inactive_anon"
3206 : "total_inactive_anon")) {
3207 sscanf(memstat + (is_unified_controller(version) ? 13 : 19), "%lu", inactive_anon);
3208 *inactive_anon /= 1024;
3209 } else if (startswith(memstat, is_unified_controller(version)
3210 ? "active_file"
3211 : "total_active_file")) {
3212 sscanf(memstat + (is_unified_controller(version) ? 11 : 17), "%lu", active_file);
3213 *active_file /= 1024;
3214 } else if (startswith(memstat, is_unified_controller(version)
3215 ? "inactive_file"
3216 : "total_inactive_file")) {
3217 sscanf(memstat + (is_unified_controller(version) ? 13 : 19), "%lu", inactive_file);
3218 *inactive_file /= 1024;
3219 } else if (startswith(memstat, is_unified_controller(version)
3220 ? "unevictable"
3221 : "total_unevictable")) {
3222 sscanf(memstat + (is_unified_controller(version) ? 11 : 17), "%lu", unevictable);
3223 *unevictable /= 1024;
3224 } else if (startswith(memstat, is_unified_controller(version)
3225 ? "shmem"
3226 : "total_shmem")) {
3227 sscanf(memstat + (is_unified_controller(version) ? 5 : 11), "%lu", shmem);
3228 *shmem /= 1024;
3229 }
3230 eol = strchr(memstat, '\n');
3231 if (!eol)
3232 return;
3233 memstat = eol+1;
3234 }
3235 }
3236
3237 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3238 {
3239 char *eol;
3240 char key[32];
3241
3242 memset(key, 0, 32);
3243 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3244
3245 size_t len = strlen(key);
3246 *v = 0;
3247
3248 while (*str) {
3249 if (startswith(str, key)) {
3250 sscanf(str + len, "%lu", v);
3251 return;
3252 }
3253 eol = strchr(str, '\n');
3254 if (!eol)
3255 return;
3256 str = eol+1;
3257 }
3258 }
3259
3260 int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
3261 {
3262 size_t linelen = 0, total_len = 0, rv = 0;
3263 char *line = NULL;
3264 char *cache = d->buf;
3265 size_t cache_size = d->buflen;
3266 FILE *f = fopen(path, "r");
3267 if (!f)
3268 return 0;
3269
3270 while (getline(&line, &linelen, f) != -1) {
3271 ssize_t l = snprintf(cache, cache_size, "%s", line);
3272 if (l < 0) {
3273 perror("Error writing to cache");
3274 rv = 0;
3275 goto err;
3276 }
3277 if (l >= cache_size) {
3278 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3279 rv = 0;
3280 goto err;
3281 }
3282 cache += l;
3283 cache_size -= l;
3284 total_len += l;
3285 }
3286
3287 d->size = total_len;
3288 if (total_len > size)
3289 total_len = size;
3290
3291 /* read from off 0 */
3292 memcpy(buf, d->buf, total_len);
3293 rv = total_len;
3294 err:
3295 fclose(f);
3296 free(line);
3297 if (d->size > rv)
3298 d->cached = d->size - rv;
3299 return rv;
3300 }
3301
3302 /*
3303 * FUSE ops for /proc
3304 */
3305
3306 static unsigned long get_memlimit(const char *cgroup, bool swap)
3307 {
3308 int ret;
3309 __do_free char *memlimit_str = NULL;
3310 unsigned long memlimit = -1;
3311
3312 if (swap)
3313 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3314 else
3315 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3316 if (ret > 0)
3317 memlimit = strtoul(memlimit_str, NULL, 10);
3318
3319 return memlimit;
3320 }
3321
3322 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3323 {
3324 __do_free char *copy = NULL;
3325 unsigned long memlimit = 0;
3326 unsigned long retlimit;
3327
3328 copy = strdup(cgroup);
3329 retlimit = get_memlimit(copy, swap);
3330
3331 while (strcmp(copy, "/") != 0) {
3332 char *it = copy;
3333
3334 it = dirname(it);
3335 memlimit = get_memlimit(it, swap);
3336 if (memlimit != -1 && memlimit < retlimit)
3337 retlimit = memlimit;
3338 }
3339
3340 return retlimit;
3341 }
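
/* Worked example (illustrative paths): for cgroup "/lxc/c1" the limit is
 * read for "/lxc/c1", then dirname() steps the copy to "/lxc" and finally
 * "/", reading the limit at each level; the smallest limit found along the
 * way is returned, i.e. a parent cgroup's tighter limit wins. */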
3342
3343 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3344 struct fuse_file_info *fi)
3345 {
3346 __do_free char *cgroup = NULL, *line = NULL,
3347 *memusage_str = NULL, *memstat_str = NULL,
3348 *memswlimit_str = NULL, *memswusage_str = NULL;
3349 __do_fclose FILE *f = NULL;
3350 struct fuse_context *fc = fuse_get_context();
3351 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
3352 struct file_info *d = (struct file_info *)fi->fh;
3353 unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3354 memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3355 inactive_anon = 0, active_file = 0, inactive_file = 0,
3356 unevictable = 0, shmem = 0, hostswtotal = 0;
3357 size_t linelen = 0, total_len = 0;
3358 char *cache = d->buf;
3359 size_t cache_size = d->buflen;
3360 int ret;
3361
3362 if (offset) {
3363 int left;
3364
3365 if (offset > d->size)
3366 return -EINVAL;
3367
3368 if (!d->cached)
3369 return 0;
3370
3371 left = d->size - offset;
3372 total_len = left > size ? size : left;
3373 memcpy(buf, cache + offset, total_len);
3374
3375 return total_len;
3376 }
3377
3378 pid_t initpid = lookup_initpid_in_store(fc->pid);
3379 if (initpid <= 1 || is_shared_pidns(initpid))
3380 initpid = fc->pid;
3381
3382 cgroup = get_pid_cgroup(initpid, "memory");
3383 if (!cgroup)
3384 return read_file_fuse("/proc/meminfo", buf, size, d);
3385
3386 prune_init_slice(cgroup);
3387
3388 memlimit = get_min_memlimit(cgroup, false);
3389
3390 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3391 if (ret < 0)
3392 return 0;
3393
3394 ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3395 if (ret < 0)
3396 return 0;
3397 parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3398 &active_file, &inactive_file, &unevictable, &shmem);
3399
3400 /*
3401 * Following values are allowed to fail, because swapaccount might be
3402 * turned off for current kernel.
3403 */
3404 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3405 if (ret >= 0)
3406 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3407 if (ret >= 0) {
3408 memswlimit = get_min_memlimit(cgroup, true);
3409 memswusage = strtoul(memswusage_str, NULL, 10);
3410 memswlimit = memswlimit / 1024;
3411 memswusage = memswusage / 1024;
3412 }
3413
3414 memusage = strtoul(memusage_str, NULL, 10);
3415 memlimit /= 1024;
3416 memusage /= 1024;
3417
3418 f = fopen("/proc/meminfo", "r");
3419 if (!f)
3420 return 0;
3421
3422 while (getline(&line, &linelen, f) != -1) {
3423 ssize_t l;
3424 char *printme, lbuf[100];
3425
3426 memset(lbuf, 0, 100);
3427 if (startswith(line, "MemTotal:")) {
3428 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3429 if (hosttotal < memlimit)
3430 memlimit = hosttotal;
3431 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3432 printme = lbuf;
3433 } else if (startswith(line, "MemFree:")) {
3434 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3435 printme = lbuf;
3436 } else if (startswith(line, "MemAvailable:")) {
3437 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3438 printme = lbuf;
3439 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3440 opts && opts->swap_off == false) {
3441 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3442 if (hostswtotal < memswlimit)
3443 memswlimit = hostswtotal;
3444 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3445 printme = lbuf;
3446 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3447 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
3448 printme = lbuf;
3449 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3450 memswusage > 0 && opts && opts->swap_off == false) {
3451 unsigned long swaptotal = memswlimit,
3452 swapusage = memusage > memswusage
3453 ? 0
3454 : memswusage - memusage,
3455 swapfree = swapusage < swaptotal
3456 ? swaptotal - swapusage
3457 : 0;
3458 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3459 printme = lbuf;
3460 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3461 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
3462 printme = lbuf;
3463 } else if (startswith(line, "Slab:")) {
3464 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3465 printme = lbuf;
3466 } else if (startswith(line, "Buffers:")) {
3467 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3468 printme = lbuf;
3469 } else if (startswith(line, "Cached:")) {
3470 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3471 printme = lbuf;
3472 } else if (startswith(line, "SwapCached:")) {
3473 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3474 printme = lbuf;
3475 } else if (startswith(line, "Active:")) {
3476 snprintf(lbuf, 100, "Active: %8lu kB\n",
3477 active_anon + active_file);
3478 printme = lbuf;
3479 } else if (startswith(line, "Inactive:")) {
3480 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3481 inactive_anon + inactive_file);
3482 printme = lbuf;
3483 } else if (startswith(line, "Active(anon)")) {
3484 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3485 printme = lbuf;
3486 } else if (startswith(line, "Inactive(anon)")) {
3487 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3488 printme = lbuf;
3489 } else if (startswith(line, "Active(file)")) {
3490 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3491 printme = lbuf;
3492 } else if (startswith(line, "Inactive(file)")) {
3493 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3494 printme = lbuf;
3495 } else if (startswith(line, "Unevictable")) {
3496 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3497 printme = lbuf;
3498 } else if (startswith(line, "SReclaimable")) {
3499 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3500 printme = lbuf;
3501 } else if (startswith(line, "SUnreclaim")) {
3502 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3503 printme = lbuf;
3504 } else if (startswith(line, "Shmem:")) {
3505 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3506 printme = lbuf;
3507 } else if (startswith(line, "ShmemHugePages")) {
3508 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3509 printme = lbuf;
3510 } else if (startswith(line, "ShmemPmdMapped")) {
3511 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3512 printme = lbuf;
3513 } else
3514 printme = line;
3515
3516 l = snprintf(cache, cache_size, "%s", printme);
3517 if (l < 0) {
3518 perror("Error writing to cache");
3519 return 0;
3520
3521 }
3522 if (l >= cache_size) {
3523 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3524 return 0;
3525 }
3526
3527 cache += l;
3528 cache_size -= l;
3529 total_len += l;
3530 }
3531
3532 d->cached = 1;
3533 d->size = total_len;
3534 if (total_len > size) total_len = size;
3535 memcpy(buf, d->buf, total_len);
3536
3537 return total_len;
3538 }
3539
3540 /*
3541 * Read the cpuset.cpus for cg
3542 * Return the answer in a newly allocated string which must be freed
3543 */
3544 char *get_cpuset(const char *cg)
3545 {
3546 char *value = NULL;
3547 int ret;
3548
3549 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3550 if (ret < 0)
3551 return NULL;
3552
3553 return value;
3554 }
3555
3556 bool cpu_in_cpuset(int cpu, const char *cpuset);
3557
3558 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3559 {
3560 int cpu;
3561
3562 if (sscanf(line, "processor : %d", &cpu) != 1)
3563 return false;
3564 return cpu_in_cpuset(cpu, cpuset);
3565 }
3566
3567 /*
3568 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3569 * depending on `param`. The parameter value is returned through `value`.
3570 */
3571 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3572 {
3573 bool rv = false;
3574 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3575 char *str = NULL;
3576
3577 sprintf(file, "cpu.cfs_%s_us", param);
3578
3579 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3580 goto err;
3581
3582 if (sscanf(str, "%" SCNd64, value) != 1)
3583 goto err;
3584
3585 rv = true;
3586
3587 err:
3588 if (str)
3589 free(str);
3590 return rv;
3591 }
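
/* On the legacy cpu controller, cpu.cfs_quota_us reads "-1" when no
 * bandwidth limit is set, which is why the callers below treat a parsed
 * cfs_quota <= 0 as "no quota configured". */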
3592
3593 /*
3594 * Return the maximum number of visible CPUs based on CPU quotas.
3595 * If there is no quota set, zero is returned.
3596 */
3597 int max_cpu_count(const char *cg)
3598 {
3599 int rv, nprocs;
3600 int64_t cfs_quota, cfs_period;
3601 int nr_cpus_in_cpuset = 0;
3602 char *cpuset = NULL;
3603
3604 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3605 return 0;
3606
3607 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3608 return 0;
3609
3610 cpuset = get_cpuset(cg);
3611 if (cpuset)
3612 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3613
3614 if (cfs_quota <= 0 || cfs_period <= 0){
3615 if (nr_cpus_in_cpuset > 0)
3616 return nr_cpus_in_cpuset;
3617
3618 return 0;
3619 }
3620
3621 rv = cfs_quota / cfs_period;
3622
3623 /* In case quota/period does not yield a whole number, add one CPU for
3624 * the remainder.
3625 */
3626 if ((cfs_quota % cfs_period) > 0)
3627 rv += 1;
3628
3629 nprocs = get_nprocs();
3630
3631 if (rv > nprocs)
3632 rv = nprocs;
3633
3634 /* use min value in cpu quota and cpuset */
3635 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3636 rv = nr_cpus_in_cpuset;
3637
3638 return rv;
3639 }
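
/* Worked example (made-up numbers): cfs_quota_us = 150000 and
 * cfs_period_us = 100000 give 150000 / 100000 = 1 with a remainder, so rv
 * becomes 2; a cpuset of "0-3" leaves that alone, while a cpuset of "0"
 * would clamp the result down to 1. */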
3640
3641 /*
3642 * Return the exact number of visible CPUs based on CPU quotas.
3643 * If there is no quota set, zero is returned.
3644 */
3645 static double exact_cpu_count(const char *cg)
3646 {
3647 double rv;
3648 int nprocs;
3649 int64_t cfs_quota, cfs_period;
3650
3651 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3652 return 0;
3653
3654 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3655 return 0;
3656
3657 if (cfs_quota <= 0 || cfs_period <= 0)
3658 return 0;
3659
3660 rv = (double)cfs_quota / (double)cfs_period;
3661
3662 nprocs = get_nprocs();
3663
3664 if (rv > nprocs)
3665 rv = nprocs;
3666
3667 return rv;
3668 }
3669
3670 /*
3671 * Determine whether CPU views should be used or not.
3672 */
3673 bool use_cpuview(const char *cg)
3674 {
3675 int cfd;
3676
3677 cfd = find_mounted_controller("cpu");
3678 if (cfd < 0)
3679 return false;
3680
3681 cfd = find_mounted_controller("cpuacct");
3682 if (cfd < 0)
3683 return false;
3684
3685 return true;
3686 }
3687
3688 /*
3689 * check whether this is a "^processor" line in /proc/cpuinfo
3690 */
3691 static bool is_processor_line(const char *line)
3692 {
3693 int cpu;
3694
3695 if (sscanf(line, "processor : %d", &cpu) == 1)
3696 return true;
3697 return false;
3698 }
3699
3700 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3701 struct fuse_file_info *fi)
3702 {
3703 struct fuse_context *fc = fuse_get_context();
3704 struct file_info *d = (struct file_info *)fi->fh;
3705 char *cg;
3706 char *cpuset = NULL;
3707 char *line = NULL;
3708 size_t linelen = 0, total_len = 0, rv = 0;
3709 bool am_printing = false, firstline = true, is_s390x = false;
3710 int curcpu = -1, cpu, max_cpus = 0;
3711 bool use_view;
3712 char *cache = d->buf;
3713 size_t cache_size = d->buflen;
3714 FILE *f = NULL;
3715
3716 if (offset){
3717 if (offset > d->size)
3718 return -EINVAL;
3719 if (!d->cached)
3720 return 0;
3721 int left = d->size - offset;
3722 total_len = left > size ? size: left;
3723 memcpy(buf, cache + offset, total_len);
3724 return total_len;
3725 }
3726
3727 pid_t initpid = lookup_initpid_in_store(fc->pid);
3728 if (initpid <= 1 || is_shared_pidns(initpid))
3729 initpid = fc->pid;
3730 cg = get_pid_cgroup(initpid, "cpuset");
3731 if (!cg)
3732 return read_file_fuse("/proc/cpuinfo", buf, size, d);
3733 prune_init_slice(cg);
3734
3735 cpuset = get_cpuset(cg);
3736 if (!cpuset)
3737 goto err;
3738
3739 use_view = use_cpuview(cg);
3740
3741 if (use_view)
3742 max_cpus = max_cpu_count(cg);
3743
3744 f = fopen("/proc/cpuinfo", "r");
3745 if (!f)
3746 goto err;
3747
3748 while (getline(&line, &linelen, f) != -1) {
3749 ssize_t l;
3750 if (firstline) {
3751 firstline = false;
3752 if (strstr(line, "IBM/S390") != NULL) {
3753 is_s390x = true;
3754 am_printing = true;
3755 continue;
3756 }
3757 }
3758 if (strncmp(line, "# processors:", 12) == 0)
3759 continue;
3760 if (is_processor_line(line)) {
3761 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3762 break;
3763 am_printing = cpuline_in_cpuset(line, cpuset);
3764 if (am_printing) {
3765 curcpu ++;
3766 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3767 if (l < 0) {
3768 perror("Error writing to cache");
3769 rv = 0;
3770 goto err;
3771 }
3772 if (l >= cache_size) {
3773 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3774 rv = 0;
3775 goto err;
3776 }
3777 cache += l;
3778 cache_size -= l;
3779 total_len += l;
3780 }
3781 continue;
3782 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3783 char *p;
3784 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3785 break;
3786 if (!cpu_in_cpuset(cpu, cpuset))
3787 continue;
3788 curcpu ++;
3789 p = strchr(line, ':');
3790 if (!p || !*p)
3791 goto err;
3792 p++;
3793 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3794 if (l < 0) {
3795 perror("Error writing to cache");
3796 rv = 0;
3797 goto err;
3798 }
3799 if (l >= cache_size) {
3800 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3801 rv = 0;
3802 goto err;
3803 }
3804 cache += l;
3805 cache_size -= l;
3806 total_len += l;
3807 continue;
3808
3809 }
3810 if (am_printing) {
3811 l = snprintf(cache, cache_size, "%s", line);
3812 if (l < 0) {
3813 perror("Error writing to cache");
3814 rv = 0;
3815 goto err;
3816 }
3817 if (l >= cache_size) {
3818 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3819 rv = 0;
3820 goto err;
3821 }
3822 cache += l;
3823 cache_size -= l;
3824 total_len += l;
3825 }
3826 }
3827
3828 if (is_s390x) {
3829 char *origcache = d->buf;
3830 ssize_t l;
3831 do {
3832 d->buf = malloc(d->buflen);
3833 } while (!d->buf);
3834 cache = d->buf;
3835 cache_size = d->buflen;
3836 total_len = 0;
3837 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3838 if (l < 0 || l >= cache_size) {
3839 free(origcache);
3840 goto err;
3841 }
3842 cache_size -= l;
3843 cache += l;
3844 total_len += l;
3845 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3846 if (l < 0 || l >= cache_size) {
3847 free(origcache);
3848 goto err;
3849 }
3850 cache_size -= l;
3851 cache += l;
3852 total_len += l;
3853 l = snprintf(cache, cache_size, "%s", origcache);
3854 free(origcache);
3855 if (l < 0 || l >= cache_size)
3856 goto err;
3857 total_len += l;
3858 }
3859
3860 d->cached = 1;
3861 d->size = total_len;
3862 if (total_len > size) total_len = size;
3863
3864 /* read from off 0 */
3865 memcpy(buf, d->buf, total_len);
3866 rv = total_len;
3867 err:
3868 if (f)
3869 fclose(f);
3870 free(line);
3871 free(cpuset);
3872 free(cg);
3873 return rv;
3874 }
3875
3876 static uint64_t get_reaper_start_time(pid_t pid)
3877 {
3878 int ret;
3879 FILE *f;
3880 uint64_t starttime;
3881 /* strlen("/proc/") = 6
3882 * +
3883 * LXCFS_NUMSTRLEN64
3884 * +
3885 * strlen("/stat") = 5
3886 * +
3887 * \0 = 1
3888 * */
3889 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3890 char path[__PROC_PID_STAT_LEN];
3891 pid_t qpid;
3892
3893 qpid = lookup_initpid_in_store(pid);
3894 if (qpid <= 0) {
3895 /* Caller can check for EINVAL on 0. */
3896 errno = EINVAL;
3897 return 0;
3898 }
3899
3900 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3901 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3902 /* Caller can check for EINVAL on 0. */
3903 errno = EINVAL;
3904 return 0;
3905 }
3906
3907 f = fopen(path, "r");
3908 if (!f) {
3909 /* Caller can check for EINVAL on 0. */
3910 errno = EINVAL;
3911 return 0;
3912 }
3913
3914 /* Note that the *scanf() argument suppression requires that length
3915 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3916 * at us. It's like telling someone you're not married and then asking
3917 * if you can bring your wife to the party.
3918 */
3919 ret = fscanf(f, "%*d " /* (1) pid %d */
3920 "%*s " /* (2) comm %s */
3921 "%*c " /* (3) state %c */
3922 "%*d " /* (4) ppid %d */
3923 "%*d " /* (5) pgrp %d */
3924 "%*d " /* (6) session %d */
3925 "%*d " /* (7) tty_nr %d */
3926 "%*d " /* (8) tpgid %d */
3927 "%*u " /* (9) flags %u */
3928 "%*u " /* (10) minflt %lu */
3929 "%*u " /* (11) cminflt %lu */
3930 "%*u " /* (12) majflt %lu */
3931 "%*u " /* (13) cmajflt %lu */
3932 "%*u " /* (14) utime %lu */
3933 "%*u " /* (15) stime %lu */
3934 "%*d " /* (16) cutime %ld */
3935 "%*d " /* (17) cstime %ld */
3936 "%*d " /* (18) priority %ld */
3937 "%*d " /* (19) nice %ld */
3938 "%*d " /* (20) num_threads %ld */
3939 "%*d " /* (21) itrealvalue %ld */
3940 "%" PRIu64, /* (22) starttime %llu */
3941 &starttime);
3942 if (ret != 1) {
3943 fclose(f);
3944 /* Caller can check for EINVAL on 0. */
3945 errno = EINVAL;
3946 return 0;
3947 }
3948
3949 fclose(f);
3950
3951 errno = 0;
3952 return starttime;
3953 }
3954
3955 static double get_reaper_start_time_in_sec(pid_t pid)
3956 {
3957 uint64_t clockticks, ticks_per_sec;
3958 int64_t ret;
3959 double res = 0;
3960
3961 clockticks = get_reaper_start_time(pid);
3962 if (clockticks == 0 && errno == EINVAL) {
3963 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3964 return 0;
3965 }
3966
3967 ret = sysconf(_SC_CLK_TCK);
3968 if (ret < 0 && errno == EINVAL) {
3969 lxcfs_debug(
3970 "%s\n",
3971 "failed to determine number of clock ticks in a second");
3972 return 0;
3973 }
3974
3975 ticks_per_sec = (uint64_t)ret;
3976 res = (double)clockticks / ticks_per_sec;
3977 return res;
3978 }
3979
3980 static double get_reaper_age(pid_t pid)
3981 {
3982 uint64_t uptime_ms;
3983 double procstart, procage;
3984
3985 /* We need to subtract the time at which the reaper started (measured
3986 * since system boot) from the current CLOCK_BOOTTIME reading to get
3987 * the actual reaper age.
3988 */
3989 procstart = get_reaper_start_time_in_sec(pid);
3990 procage = procstart;
3991 if (procstart > 0) {
3992 int ret;
3993 struct timespec spec;
3994
3995 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3996 if (ret < 0)
3997 return 0;
3998
3999 /* We could make this more precise here by using the tv_nsec
4000 * field in the timespec struct and convert it to milliseconds
4001 * and then create a double for the seconds and milliseconds but
4002 * that seems more work than it is worth.
4003 */
4004 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4005 procage = (uptime_ms - (procstart * 1000)) / 1000;
4006 }
4007
4008 return procage;
4009 }
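
/* Worked example (made-up numbers): a reaper starttime of 250000 clock
 * ticks with _SC_CLK_TCK == 100 puts its start at 2500 s after boot; with
 * CLOCK_BOOTTIME reading 10000 s, the computed age, i.e. the uptime shown
 * inside the container, is (10000000 ms - 2500000 ms) / 1000 = 7500 s. */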
4010
4011 /*
4012 * Returns 0 on success.
4013 * It is the caller's responsibility to free `return_usage`, unless this
4014 * function returns an error.
4015 */
4016 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4017 {
4018 int cpucount = get_nprocs_conf();
4019 struct cpuacct_usage *cpu_usage;
4020 int rv = 0, i, j, ret;
4021 int cg_cpu;
4022 uint64_t cg_user, cg_system;
4023 int64_t ticks_per_sec;
4024 char *usage_str = NULL;
4025
4026 ticks_per_sec = sysconf(_SC_CLK_TCK);
4027
4028 if (ticks_per_sec < 0 && errno == EINVAL) {
4029 lxcfs_v(
4030 "%s\n",
4031 "read_cpuacct_usage_all failed to determine number of clock ticks "
4032 "in a second");
4033 return -1;
4034 }
4035
4036 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4037 if (!cpu_usage)
4038 return -ENOMEM;
4039
4040 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
4041 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4042 // read cpuacct.usage_percpu instead
4043 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4044 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
4045 rv = -1;
4046 goto err;
4047 }
4048 lxcfs_v("usage_str: %s\n", usage_str);
4049
4050 // convert cpuacct.usage_percpu into cpuacct.usage_all
4051 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4052
4053 char *data = NULL;
4054 size_t sz = 0, asz = 0;
4055
4056 must_strcat(&data, &sz, &asz, "cpu user system\n");
4057
4058 int i = 0, read_pos = 0, read_cnt = 0;
4059 while (sscanf(usage_str + read_pos, "%" SCNu64 " %n", &cg_user, &read_cnt) > 0) {
4060 lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4061 must_strcat(&data, &sz, &asz, "%d %" PRIu64 " 0\n", i, cg_user);
4062 i++;
4063 read_pos += read_cnt;
4064 }
4065
4066 free(usage_str);
4067 usage_str = data;
4068
4069 lxcfs_v("usage_str: %s\n", usage_str);
4070 }
4071
4072 int read_pos = 0, read_cnt = 0;
4073 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4074 lxcfs_error("read_cpuacct_usage_all reading first line from "
4075 "%s/cpuacct.usage_all failed.\n", cg);
4076 rv = -1;
4077 goto err;
4078 }
4079
4080 read_pos += read_cnt;
4081
4082 for (i = 0, j = 0; i < cpucount; i++) {
4083 ret = sscanf(usage_str + read_pos, "%d %" SCNu64 " %" SCNu64 "\n%n", &cg_cpu, &cg_user,
4084 &cg_system, &read_cnt);
4085
4086 if (ret == EOF)
4087 break;
4088
4089 if (ret != 3) {
4090 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4091 "failed.\n", cg);
4092 rv = -1;
4093 goto err;
4094 }
4095
4096 read_pos += read_cnt;
4097
4098 /* Convert the time from nanoseconds to USER_HZ */
4099 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4100 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4101 j++;
4102 }
4103
4104 rv = 0;
4105 *return_usage = cpu_usage;
4106 *size = cpucount;
4107
4108 err:
4109 if (usage_str)
4110 free(usage_str);
4111
4112 if (rv != 0) {
4113 free(cpu_usage);
4114 *return_usage = NULL;
4115 }
4116
4117 return rv;
4118 }
4119
4120 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4121 {
4122 int i;
4123 unsigned long sum = 0;
4124
4125 for (i = 0; i < cpu_count; i++) {
4126 if (!newer[i].online)
4127 continue;
4128
4129 /* When cpuset is changed on the fly, the CPUs might get reordered.
4130 * We could either reset all counters, or check that the subtractions
4131 * below will return expected results.
4132 */
4133 if (newer[i].user > older[i].user)
4134 diff[i].user = newer[i].user - older[i].user;
4135 else
4136 diff[i].user = 0;
4137
4138 if (newer[i].system > older[i].system)
4139 diff[i].system = newer[i].system - older[i].system;
4140 else
4141 diff[i].system = 0;
4142
4143 if (newer[i].idle > older[i].idle)
4144 diff[i].idle = newer[i].idle - older[i].idle;
4145 else
4146 diff[i].idle = 0;
4147
4148 sum += diff[i].user;
4149 sum += diff[i].system;
4150 sum += diff[i].idle;
4151 }
4152
4153 return sum;
4154 }
4155
4156 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4157 {
4158 unsigned long free_space, to_add;
4159
4160 free_space = threshold - usage->user - usage->system;
4161
4162 if (free_space > usage->idle)
4163 free_space = usage->idle;
4164
4165 to_add = free_space > *surplus ? *surplus : free_space;
4166
4167 *counter += to_add;
4168 usage->idle -= to_add;
4169 *surplus -= to_add;
4170 }
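
/* Worked example (made-up numbers): with threshold = 100 ticks and
 * usage = {user = 30, system = 20, idle = 60}, free_space is 100 - 30 - 20
 * = 50 and stays 50 after the idle cap; given *surplus = 80, to_add = 50,
 * leaving idle = 10 and *surplus = 30 for the next CPU. */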
4171
4172 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4173 {
4174 struct cg_proc_stat *first = NULL, *prev, *tmp;
4175
4176 for (prev = NULL; node; ) {
4177 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4178 tmp = node;
4179 lxcfs_debug("Removing stat node for %s\n", node->cg);
4180
4181 if (prev)
4182 prev->next = node->next;
4183 else
4184 first = node->next;
4185
4186 node = node->next;
4187 free_proc_stat_node(tmp);
4188 } else {
4189 if (!first)
4190 first = node;
4191 prev = node;
4192 node = node->next;
4193 }
4194 }
4195
4196 return first;
4197 }
4198
4199 #define PROC_STAT_PRUNE_INTERVAL 10
4200 static void prune_proc_stat_history(void)
4201 {
4202 int i;
4203 time_t now = time(NULL);
4204
4205 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4206 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4207
4208 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4209 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4210 return;
4211 }
4212
4213 if (proc_stat_history[i]->next) {
4214 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4215 proc_stat_history[i]->lastcheck = now;
4216 }
4217
4218 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4219 }
4220 }
4221
4222 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4223 {
4224 struct cg_proc_stat *node;
4225
4226 pthread_rwlock_rdlock(&head->lock);
4227
4228 if (!head->next) {
4229 pthread_rwlock_unlock(&head->lock);
4230 return NULL;
4231 }
4232
4233 node = head->next;
4234
4235 do {
4236 if (strcmp(cg, node->cg) == 0)
4237 goto out;
4238 } while ((node = node->next));
4239
4240 node = NULL;
4241
4242 out:
4243 pthread_rwlock_unlock(&head->lock);
4244 prune_proc_stat_history();
4245 return node;
4246 }
4247
4248 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4249 {
4250 struct cg_proc_stat *node;
4251 int i;
4252
4253 node = malloc(sizeof(struct cg_proc_stat));
4254 if (!node)
4255 goto err;
4256
4257 node->cg = NULL;
4258 node->usage = NULL;
4259 node->view = NULL;
4260
4261 node->cg = malloc(strlen(cg) + 1);
4262 if (!node->cg)
4263 goto err;
4264
4265 strcpy(node->cg, cg);
4266
4267 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4268 if (!node->usage)
4269 goto err;
4270
4271 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4272
4273 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4274 if (!node->view)
4275 goto err;
4276
4277 node->cpu_count = cpu_count;
4278 node->next = NULL;
4279
4280 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4281 lxcfs_error("%s\n", "Failed to initialize node lock");
4282 goto err;
4283 }
4284
4285 for (i = 0; i < cpu_count; i++) {
4286 node->view[i].user = 0;
4287 node->view[i].system = 0;
4288 node->view[i].idle = 0;
4289 }
4290
4291 return node;
4292
4293 err:
4294 if (node && node->cg)
4295 free(node->cg);
4296 if (node && node->usage)
4297 free(node->usage);
4298 if (node && node->view)
4299 free(node->view);
4300 if (node)
4301 free(node);
4302
4303 return NULL;
4304 }
4305
4306 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4307 {
4308 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4309 struct cg_proc_stat_head *head = proc_stat_history[hash];
4310 struct cg_proc_stat *node, *rv = new_node;
4311
4312 pthread_rwlock_wrlock(&head->lock);
4313
4314 if (!head->next) {
4315 head->next = new_node;
4316 goto out;
4317 }
4318
4319 node = head->next;
4320
4321 for (;;) {
4322 if (strcmp(node->cg, new_node->cg) == 0) {
4323 /* The node is already present, return it */
4324 free_proc_stat_node(new_node);
4325 rv = node;
4326 goto out;
4327 }
4328
4329 if (node->next) {
4330 node = node->next;
4331 continue;
4332 }
4333
4334 node->next = new_node;
4335 goto out;
4336 }
4337
4338 out:
4339 pthread_rwlock_unlock(&head->lock);
4340 return rv;
4341 }
4342
4343 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4344 {
4345 struct cpuacct_usage *new_usage, *new_view;
4346 int i;
4347
4348 /* Allocate new memory */
4349 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4350 if (!new_usage)
4351 return false;
4352
4353 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4354 if (!new_view) {
4355 free(new_usage);
4356 return false;
4357 }
4358
4359 /* Copy existing data & initialize new elements */
4360 for (i = 0; i < cpu_count; i++) {
4361 if (i < node->cpu_count) {
4362 new_usage[i].user = node->usage[i].user;
4363 new_usage[i].system = node->usage[i].system;
4364 new_usage[i].idle = node->usage[i].idle;
4365
4366 new_view[i].user = node->view[i].user;
4367 new_view[i].system = node->view[i].system;
4368 new_view[i].idle = node->view[i].idle;
4369 } else {
4370 new_usage[i].user = 0;
4371 new_usage[i].system = 0;
4372 new_usage[i].idle = 0;
4373
4374 new_view[i].user = 0;
4375 new_view[i].system = 0;
4376 new_view[i].idle = 0;
4377 }
4378 }
4379
4380 free(node->usage);
4381 free(node->view);
4382
4383 node->usage = new_usage;
4384 node->view = new_view;
4385 node->cpu_count = cpu_count;
4386
4387 return true;
4388 }
4389
4390 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4391 {
4392 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4393 struct cg_proc_stat_head *head = proc_stat_history[hash];
4394 struct cg_proc_stat *node;
4395
4396 node = find_proc_stat_node(head, cg);
4397
4398 if (!node) {
4399 node = new_proc_stat_node(usage, cpu_count, cg);
4400 if (!node)
4401 return NULL;
4402
4403 node = add_proc_stat_node(node);
4404 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4405 }
4406
4407 pthread_mutex_lock(&node->lock);
4408
4409 /* If additional CPUs on the host have been enabled, CPU usage counter
4410 * arrays have to be expanded */
4411 if (node->cpu_count < cpu_count) {
4412 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4413 node->cpu_count, cpu_count, cg);
4414
4415 if (!expand_proc_stat_node(node, cpu_count)) {
4416 pthread_mutex_unlock(&node->lock);
4417 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4418 node->cpu_count, cpu_count, cg);
4419 return NULL;
4420 }
4421 }
4422
4423 return node;
4424 }
4425
4426 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4427 {
4428 int i;
4429
4430 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4431 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4432
4433 for (i = 0; i < cpu_count; i++) {
4434 node->view[i].user = 0;
4435 node->view[i].system = 0;
4436 node->view[i].idle = 0;
4437 }
4438
4439 node->cpu_count = cpu_count;
4440 }
4441
4442 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4443 {
4444 char *line = NULL;
4445 size_t linelen = 0, total_len = 0, rv = 0; ssize_t l;
4446 int curcpu = -1; /* cpu numbering starts at 0 */
4447 int physcpu, i;
4448 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4449 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4450 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4451 unsigned long user_surplus = 0, system_surplus = 0;
4452 unsigned long total_sum, threshold;
4453 struct cg_proc_stat *stat_node;
4454 struct cpuacct_usage *diff = NULL;
4455 int nprocs = get_nprocs_conf();
4456
4457 if (cg_cpu_usage_size < nprocs)
4458 nprocs = cg_cpu_usage_size;
4459
4460 /* Read all CPU stats and stop at the first non-cpu line */
4461 while (getline(&line, &linelen, f) != -1) {
4462 int ret;
4463 char cpu_char[10]; /* That's a lot of cores */
4464 uint64_t all_used, cg_used;
4465
4466 if (strlen(line) == 0)
4467 continue;
4468 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4469 /* not a ^cpuN line containing a number N */
4470 break;
4471 }
4472
4473 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4474 continue;
4475
4476 if (physcpu >= cg_cpu_usage_size)
4477 continue;
4478
4479 curcpu++;
4480 cpu_cnt++;
4481
4482 if (!cpu_in_cpuset(physcpu, cpuset)) {
4483 for (i = curcpu; i <= physcpu; i++) {
4484 cg_cpu_usage[i].online = false;
4485 }
4486 continue;
4487 }
4488
4489 if (curcpu < physcpu) {
4490 /* Some CPUs may be disabled */
4491 for (i = curcpu; i < physcpu; i++)
4492 cg_cpu_usage[i].online = false;
4493
4494 curcpu = physcpu;
4495 }
4496
4497 cg_cpu_usage[curcpu].online = true;
4498
4499 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4500 &user,
4501 &nice,
4502 &system,
4503 &idle,
4504 &iowait,
4505 &irq,
4506 &softirq,
4507 &steal,
4508 &guest,
4509 &guest_nice);
4510
4511 if (ret != 10)
4512 continue;
4513
4514 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4515 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4516
4517 if (all_used >= cg_used) {
4518 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4519
4520 } else {
4521 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4522 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4523 curcpu, cg, all_used, cg_used);
4524 cg_cpu_usage[curcpu].idle = idle;
4525 }
4526 }
4527
4528 /* Cannot use more CPUs than are available due to the cpuset */
4529 if (max_cpus > cpu_cnt)
4530 max_cpus = cpu_cnt;
4531
4532 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4533
4534 if (!stat_node) {
4535 lxcfs_error("unable to find/create stat node for %s\n", cg);
4536 rv = 0;
4537 goto err;
4538 }
4539
4540 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4541 if (!diff) {
4542 rv = 0;
4543 goto err;
4544 }
4545
4546 /*
4547 * If the new values are LOWER than values stored in memory, it means
4548 * the cgroup has been reset/recreated and we should reset too.
4549 */
4550 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4551 if (!cg_cpu_usage[curcpu].online)
4552 continue;
4553
4554 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4555 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4556
4557 break;
4558 }
4559
4560 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4561
4562 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4563 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4564
4565 if (!stat_node->usage[curcpu].online)
4566 continue;
4567
4568 i++;
4569
4570 stat_node->usage[curcpu].user += diff[curcpu].user;
4571 stat_node->usage[curcpu].system += diff[curcpu].system;
4572 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4573
4574 if (max_cpus > 0 && i >= max_cpus) {
4575 user_surplus += diff[curcpu].user;
4576 system_surplus += diff[curcpu].system;
4577 }
4578 }
4579
4580 /* Calculate usage counters of visible CPUs */
4581 if (max_cpus > 0) {
4582 /* threshold = maximum usage per cpu, including idle */
4583 threshold = total_sum / cpu_cnt * max_cpus;
4584
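/*
 * Editor's note: time accrued on CPUs past max_cpus is collected as surplus
 * in the loop above and redistributed below, but no visible CPU is credited
 * past `threshold`. E.g. with total_sum = 400 ticks over cpu_cnt = 4 online
 * CPUs and max_cpus = 2, each visible CPU is capped at 400 / 4 * 2 = 200
 * ticks of user+system time.
 */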
4585 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4586 if (!stat_node->usage[curcpu].online)
4587 continue;
4588
4589 i++;
4590
4591 if (i == max_cpus)
4592 break;
4593
4594 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4595 continue;
4596
4597 /* Add user */
4598 add_cpu_usage(
4599 &user_surplus,
4600 &diff[curcpu],
4601 &diff[curcpu].user,
4602 threshold);
4603
4604 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4605 continue;
4606
4607 /* If there is still room, add system */
4608 add_cpu_usage(
4609 &system_surplus,
4610 &diff[curcpu],
4611 &diff[curcpu].system,
4612 threshold);
4613 }
4614
4615 if (user_surplus > 0)
4616 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4617 if (system_surplus > 0)
4618 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4619
4620 unsigned long diff_user = 0;
4621 unsigned long diff_system = 0;
4622 unsigned long diff_idle = 0;
4623 unsigned long max_diff_idle = 0;
4624 unsigned long max_diff_idle_index = 0;
4625 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4626 if (!stat_node->usage[curcpu].online)
4627 continue;
4628
4629 i++;
4630
4631 if (i == max_cpus)
4632 break;
4633
4634 stat_node->view[curcpu].user += diff[curcpu].user;
4635 stat_node->view[curcpu].system += diff[curcpu].system;
4636 stat_node->view[curcpu].idle += diff[curcpu].idle;
4637
4638 user_sum += stat_node->view[curcpu].user;
4639 system_sum += stat_node->view[curcpu].system;
4640 idle_sum += stat_node->view[curcpu].idle;
4641
4642 diff_user += diff[curcpu].user;
4643 diff_system += diff[curcpu].system;
4644 diff_idle += diff[curcpu].idle;
4645 if (diff[curcpu].idle > max_diff_idle) {
4646 max_diff_idle = diff[curcpu].idle;
4647 max_diff_idle_index = curcpu;
4648 }
4649
4650 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4651 }
4652 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4653
4654 // Revise the cpu usage view to handle the partial-cpu (cpu quota) case.
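// Editor's note: e.g. a cpu quota yielding exact_cpus = 1.5 with max_cpus = 2
// shaves delta = 25% (1 - 1.5/2) of this interval's total time off the
// reported idle time.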
4655 double exact_cpus = exact_cpu_count(cg);
4656 if (exact_cpus < (double)max_cpus){
4657 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4658 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4659 lxcfs_v("delta: %lu\n", delta);
4660 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4661 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4662 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4663
4664 curcpu = max_diff_idle_index;
4665 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4666 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4667 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4668 }
4669 } else {
4670 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4671 if (!stat_node->usage[curcpu].online)
4672 continue;
4673
4674 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4675 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4676 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4677
4678 user_sum += stat_node->view[curcpu].user;
4679 system_sum += stat_node->view[curcpu].system;
4680 idle_sum += stat_node->view[curcpu].idle;
4681 }
4682 }
4683
4684 /* Render the file */
4685 /* cpu-all */
4686 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4687 user_sum,
4688 system_sum,
4689 idle_sum);
4690 lxcfs_v("cpu-all: %s\n", buf);
4691
4692 if (l < 0) {
4693 perror("Error writing to cache");
4694 rv = 0;
4695 goto err;
4696 }
4697 if (l >= buf_size) {
4698 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4699 rv = 0;
4700 goto err;
4701 }
4702
4703 buf += l;
4704 buf_size -= l;
4705 total_len += l;
4706
4707 /* Render visible CPUs */
4708 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4709 if (!stat_node->usage[curcpu].online)
4710 continue;
4711
4712 i++;
4713
4714 if (max_cpus > 0 && i == max_cpus)
4715 break;
4716
4717 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4718 i,
4719 stat_node->view[curcpu].user,
4720 stat_node->view[curcpu].system,
4721 stat_node->view[curcpu].idle);
4722 lxcfs_v("cpu: %s\n", buf);
4723
4724 if (l < 0) {
4725 perror("Error writing to cache");
4726 rv = 0;
4727 goto err;
4728
4729 }
4730 if (l >= buf_size) {
4731 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4732 rv = 0;
4733 goto err;
4734 }
4735
4736 buf += l;
4737 buf_size -= l;
4738 total_len += l;
4739 }
4740
4741 /* Pass the rest of /proc/stat, start with the last line read */
4742 l = snprintf(buf, buf_size, "%s", line);
4743
4744 if (l < 0) {
4745 perror("Error writing to cache");
4746 rv = 0;
4747 goto err;
4748
4749 }
4750 if (l >= buf_size) {
4751 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4752 rv = 0;
4753 goto err;
4754 }
4755
4756 buf += l;
4757 buf_size -= l;
4758 total_len += l;
4759
4760 /* Pass the rest of the host's /proc/stat */
4761 while (getline(&line, &linelen, f) != -1) {
4762 l = snprintf(buf, buf_size, "%s", line);
4763 if (l < 0) {
4764 perror("Error writing to cache");
4765 rv = 0;
4766 goto err;
4767 }
4768 if (l >= buf_size) {
4769 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4770 rv = 0;
4771 goto err;
4772 }
4773 buf += l;
4774 buf_size -= l;
4775 total_len += l;
4776 }
4777
4778 rv = total_len;
4779
4780 err:
4781 if (stat_node)
4782 pthread_mutex_unlock(&stat_node->lock);
4783 if (line)
4784 free(line);
4785 if (diff)
4786 free(diff);
4787 return rv;
4788 }
4789
4790 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4791 static int proc_stat_read(char *buf, size_t size, off_t offset,
4792 struct fuse_file_info *fi)
4793 {
4794 struct fuse_context *fc = fuse_get_context();
4795 struct file_info *d = (struct file_info *)fi->fh;
4796 char *cg;
4797 char *cpuset = NULL;
4798 char *line = NULL;
4799 size_t linelen = 0, total_len = 0, rv = 0;
4800 int curcpu = -1; /* cpu numbering starts at 0 */
4801 int physcpu = 0;
4802 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4803 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4804 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4805 char cpuall[CPUALL_MAX_SIZE];
4806 /* reserve for cpu all */
4807 char *cache = d->buf + CPUALL_MAX_SIZE;
4808 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
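/*
 * Editor's note on the buffer layout: the first CPUALL_MAX_SIZE bytes of
 * d->buf are reserved for the aggregate "cpu" line; the per-cpu lines are
 * rendered after that reservation and memmove()d up behind the aggregate
 * line once the sums are known (see the end of this function).
 */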
4809 FILE *f = NULL;
4810 struct cpuacct_usage *cg_cpu_usage = NULL;
4811 int cg_cpu_usage_size = 0;
4812
4813 if (offset){
4814 if (offset > d->size)
4815 return -EINVAL;
4816 if (!d->cached)
4817 return 0;
4818 int left = d->size - offset;
4819 total_len = left > size ? size: left;
4820 memcpy(buf, d->buf + offset, total_len);
4821 return total_len;
4822 }
4823
4824 pid_t initpid = lookup_initpid_in_store(fc->pid);
4825 lxcfs_v("initpid: %d\n", initpid);
4826 if (initpid <= 0)
4827 initpid = fc->pid;
4828
4829 /*
4830 * When a container runs in the host's pid namespace, initpid == 1 and the
4831 * cgroup will be "/", so we should return the host's /proc/stat contents.
4832 * In some cases cpuacct.usage_all for "/" can be larger than /proc/stat.
4833 */
4834 if (initpid == 1) {
4835 return read_file_fuse("/proc/stat", buf, size, d);
4836 }
4837
4838 cg = get_pid_cgroup(initpid, "cpuset");
4839 lxcfs_v("cg: %s\n", cg);
4840 if (!cg)
4841 return read_file_fuse("/proc/stat", buf, size, d);
4842 prune_init_slice(cg);
4843
4844 cpuset = get_cpuset(cg);
4845 if (!cpuset)
4846 goto err;
4847
4848 /*
4849 * Read cpuacct.usage_all for all CPUs.
4850 * If the cpuacct cgroup is present, it is used to calculate the container's
4851 * CPU usage. If not, values from the host's /proc/stat are used.
4852 */
4853 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4854 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
4855 "falling back to the host's /proc/stat");
4856 }
4857
4858 f = fopen("/proc/stat", "r");
4859 if (!f)
4860 goto err;
4861
4862 // Skip the first line (the aggregate "cpu" line); we rebuild it below.
4863 if (getline(&line, &linelen, f) < 0) {
4864 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4865 goto err;
4866 }
4867
4868 if (use_cpuview(cg) && cg_cpu_usage) {
4869 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4870 f, d->buf, d->buflen);
4871 goto out;
4872 }
4873
4874 while (getline(&line, &linelen, f) != -1) {
4875 ssize_t l;
4876 char cpu_char[10]; /* That's a lot of cores */
4877 char *c;
4878 uint64_t all_used, cg_used, new_idle;
4879 int ret;
4880
4881 if (strlen(line) == 0)
4882 continue;
4883 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4884 /* not a ^cpuN line containing a number N, just print it */
4885 l = snprintf(cache, cache_size, "%s", line);
4886 if (l < 0) {
4887 perror("Error writing to cache");
4888 rv = 0;
4889 goto err;
4890 }
4891 if (l >= cache_size) {
4892 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4893 rv = 0;
4894 goto err;
4895 }
4896 cache += l;
4897 cache_size -= l;
4898 total_len += l;
4899 continue;
4900 }
4901
4902 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4903 continue;
4904 if (!cpu_in_cpuset(physcpu, cpuset))
4905 continue;
4906 curcpu++;
4907
4908 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4909 &user,
4910 &nice,
4911 &system,
4912 &idle,
4913 &iowait,
4914 &irq,
4915 &softirq,
4916 &steal,
4917 &guest,
4918 &guest_nice);
4919
4920 if (ret != 10 || !cg_cpu_usage) {
4921 c = strchr(line, ' ');
4922 if (!c)
4923 continue;
4924 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4925 if (l < 0) {
4926 perror("Error writing to cache");
4927 rv = 0;
4928 goto err;
4929
4930 }
4931 if (l >= cache_size) {
4932 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4933 rv = 0;
4934 goto err;
4935 }
4936
4937 cache += l;
4938 cache_size -= l;
4939 total_len += l;
4940
4941 if (ret != 10)
4942 continue;
4943 }
4944
4945 if (cg_cpu_usage) {
4946 if (physcpu >= cg_cpu_usage_size)
4947 break;
4948
4949 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4950 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4951
4952 if (all_used >= cg_used) {
4953 new_idle = idle + (all_used - cg_used);
4954
4955 } else {
4956 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4957 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4958 curcpu, cg, all_used, cg_used);
4959 new_idle = idle;
4960 }
4961
4962 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4963 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4964 new_idle);
4965
4966 if (l < 0) {
4967 perror("Error writing to cache");
4968 rv = 0;
4969 goto err;
4970
4971 }
4972 if (l >= cache_size) {
4973 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4974 rv = 0;
4975 goto err;
4976 }
4977
4978 cache += l;
4979 cache_size -= l;
4980 total_len += l;
4981
4982 user_sum += cg_cpu_usage[physcpu].user;
4983 system_sum += cg_cpu_usage[physcpu].system;
4984 idle_sum += new_idle;
4985
4986 } else {
4987 user_sum += user;
4988 nice_sum += nice;
4989 system_sum += system;
4990 idle_sum += idle;
4991 iowait_sum += iowait;
4992 irq_sum += irq;
4993 softirq_sum += softirq;
4994 steal_sum += steal;
4995 guest_sum += guest;
4996 guest_nice_sum += guest_nice;
4997 }
4998 }
4999
5000 cache = d->buf;
5001
5002 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5003 user_sum,
5004 nice_sum,
5005 system_sum,
5006 idle_sum,
5007 iowait_sum,
5008 irq_sum,
5009 softirq_sum,
5010 steal_sum,
5011 guest_sum,
5012 guest_nice_sum);
5013 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
5014 memcpy(cache, cpuall, cpuall_len);
5015 cache += cpuall_len;
5016 } else {
5017 /* shouldn't happen */
5018 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.\n", cpuall_len);
5019 cpuall_len = 0;
5020 }
5021
5022 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
5023 total_len += cpuall_len;
5024
5025 out:
5026 d->cached = 1;
5027 d->size = total_len;
5028 if (total_len > size)
5029 total_len = size;
5030
5031 memcpy(buf, d->buf, total_len);
5032 rv = total_len;
5033
5034 err:
5035 if (f)
5036 fclose(f);
5037 if (cg_cpu_usage)
5038 free(cg_cpu_usage);
5039 free(line);
5040 free(cpuset);
5041 free(cg);
5042 return rv;
5043 }
5044
5045 /* This function retrieves the busy time of a group of tasks by looking at
5046 * cpuacct.usage. Unfortunately, this only makes sense when the container has
5047 * been given its own cpuacct cgroup. If not, this function will take the busy
5048 * time of all other tasks that do not actually belong to the container into
5049 * account as well. If someone has a clever solution for this please send a
5050 * patch!
5051 */
5052 static double get_reaper_busy(pid_t task)
5053 {
5054 pid_t initpid = lookup_initpid_in_store(task);
5055 char *cgroup = NULL, *usage_str = NULL;
5056 unsigned long usage = 0;
5057 double res = 0;
5058
5059 if (initpid <= 0)
5060 return 0;
5061
5062 cgroup = get_pid_cgroup(initpid, "cpuacct");
5063 if (!cgroup)
5064 goto out;
5065 prune_init_slice(cgroup);
5066 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
5067 goto out;
5068 usage = strtoul(usage_str, NULL, 10);
5069 res = (double)usage / 1000000000;
5070
5071 out:
5072 free(cgroup);
5073 free(usage_str);
5074 return res;
5075 }
5076
5077 #if RELOADTEST
5078 void iwashere(void)
5079 {
5080 int fd;
5081
5082 fd = creat("/tmp/lxcfs-iwashere", 0644);
5083 if (fd >= 0)
5084 close(fd);
5085 }
5086 #endif
5087
5088 /*
5089 * For the first field we use the age of the calling pid's reaper, as
5090 * returned by get_reaper_age(). For the second (idle) field we subtract
5091 * the reaper's busy time, as returned by get_reaper_busy().
5092 */
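/*
 * Editor's note, a worked example: a reaper that is 100.00s old whose cgroup
 * has accumulated 4s of cpuacct.usage yields "100.00 96.00".
 */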
5093 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5094 struct fuse_file_info *fi)
5095 {
5096 struct fuse_context *fc = fuse_get_context();
5097 struct file_info *d = (struct file_info *)fi->fh;
5098 double busytime = get_reaper_busy(fc->pid);
5099 char *cache = d->buf;
5100 ssize_t total_len = 0;
5101 double idletime, reaperage;
5102
5103 #if RELOADTEST
5104 iwashere();
5105 #endif
5106
5107 if (offset){
5108 if (!d->cached)
5109 return 0;
5110 if (offset > d->size)
5111 return -EINVAL;
5112 int left = d->size - offset;
5113 total_len = left > size ? size: left;
5114 memcpy(buf, cache + offset, total_len);
5115 return total_len;
5116 }
5117
5118 reaperage = get_reaper_age(fc->pid);
5119 /* To understand why this is done, please read the comment to the
5120 * get_reaper_busy() function.
5121 */
5122 idletime = reaperage;
5123 if (reaperage >= busytime)
5124 idletime = reaperage - busytime;
5125
5126 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5127 if (total_len < 0 || total_len >= d->buflen){
5128 lxcfs_error("%s\n", "failed to write to cache");
5129 return 0;
5130 }
5131
5132 d->size = (int)total_len;
5133 d->cached = 1;
5134
5135 if (total_len > size) total_len = size;
5136
5137 memcpy(buf, d->buf, total_len);
5138 return total_len;
5139 }
5140
5141 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5142 struct fuse_file_info *fi)
5143 {
5144 char dev_name[72];
5145 struct fuse_context *fc = fuse_get_context();
5146 struct file_info *d = (struct file_info *)fi->fh;
5147 char *cg;
5148 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5149 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5150 unsigned long read = 0, write = 0;
5151 unsigned long read_merged = 0, write_merged = 0;
5152 unsigned long read_sectors = 0, write_sectors = 0;
5153 unsigned long read_ticks = 0, write_ticks = 0;
5154 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5155 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5156 char *cache = d->buf;
5157 size_t cache_size = d->buflen;
5158 char *line = NULL;
5159 size_t linelen = 0, total_len = 0, rv = 0;
5160 unsigned int major = 0, minor = 0;
5161 int i = 0;
5162 FILE *f = NULL;
5163
5164 if (offset){
5165 if (offset > d->size)
5166 return -EINVAL;
5167 if (!d->cached)
5168 return 0;
5169 int left = d->size - offset;
5170 total_len = left > size ? size: left;
5171 memcpy(buf, cache + offset, total_len);
5172 return total_len;
5173 }
5174
5175 pid_t initpid = lookup_initpid_in_store(fc->pid);
5176 if (initpid <= 1 || is_shared_pidns(initpid))
5177 initpid = fc->pid;
5178 cg = get_pid_cgroup(initpid, "blkio");
5179 if (!cg)
5180 return read_file_fuse("/proc/diskstats", buf, size, d);
5181 prune_init_slice(cg);
5182
5183 if (!cgroup_ops->get(cgroup_ops, "blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5184 goto err;
5185 if (!cgroup_ops->get(cgroup_ops, "blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5186 goto err;
5187 if (!cgroup_ops->get(cgroup_ops, "blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5188 goto err;
5189 if (!cgroup_ops->get(cgroup_ops, "blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5190 goto err;
5191 if (!cgroup_ops->get(cgroup_ops, "blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5192 goto err;
5193
5194
5195 f = fopen("/proc/diskstats", "r");
5196 if (!f)
5197 goto err;
5198
5199 while (getline(&line, &linelen, f) != -1) {
5200 ssize_t l;
5201 char lbuf[256];
5202
5203 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5204 if (i != 3)
5205 continue;
5206
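/* blkio reports byte and nanosecond totals, while /proc/diskstats expects
 * 512-byte sectors and milliseconds, hence the /512 and /1000000
 * conversions below. */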
5207 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5208 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5209 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5210 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5211 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5212 read_sectors = read_sectors/512;
5213 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5214 write_sectors = write_sectors/512;
5215
5216 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5217 rd_svctm = rd_svctm/1000000;
5218 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5219 rd_wait = rd_wait/1000000;
5220 read_ticks = rd_svctm + rd_wait;
5221
5222 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5223 wr_svctm = wr_svctm/1000000;
5224 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5225 wr_wait = wr_wait/1000000;
5226 write_ticks = wr_svctm + wr_wait;
5227
5228 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5229 tot_ticks = tot_ticks/1000000;
5230
5231 memset(lbuf, 0, 256);
5232 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5233 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5234 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5235 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5236 else
5237 continue;
5238
5239 l = snprintf(cache, cache_size, "%s", lbuf);
5240 if (l < 0) {
5241 perror("Error writing to fuse buf");
5242 rv = 0;
5243 goto err;
5244 }
5245 if (l >= cache_size) {
5246 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5247 rv = 0;
5248 goto err;
5249 }
5250 cache += l;
5251 cache_size -= l;
5252 total_len += l;
5253 }
5254
5255 d->cached = 1;
5256 d->size = total_len;
5257 if (total_len > size) total_len = size;
5258 memcpy(buf, d->buf, total_len);
5259
5260 rv = total_len;
5261 err:
5262 free(cg);
5263 if (f)
5264 fclose(f);
5265 free(line);
5266 free(io_serviced_str);
5267 free(io_merged_str);
5268 free(io_service_bytes_str);
5269 free(io_wait_time_str);
5270 free(io_service_time_str);
5271 return rv;
5272 }
5273
5274 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5275 struct fuse_file_info *fi)
5276 {
5277 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5278 *memswusage_str = NULL;
5279 struct fuse_context *fc = fuse_get_context();
5280 struct file_info *d = (struct file_info *)fi->fh;
5281 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5282 memswusage = 0, swap_total = 0, swap_free = 0;
5283 ssize_t total_len = 0;
5284 ssize_t l = 0;
5285 char *cache = d->buf;
5286 int ret;
5287
5288 if (offset) {
5289 int left;
5290
5291 if (offset > d->size)
5292 return -EINVAL;
5293
5294 if (!d->cached)
5295 return 0;
5296
5297 left = d->size - offset;
5298 total_len = left > size ? size: left;
5299 memcpy(buf, cache + offset, total_len);
5300
5301 return total_len;
5302 }
5303
5304 pid_t initpid = lookup_initpid_in_store(fc->pid);
5305 if (initpid <= 1 || is_shared_pidns(initpid))
5306 initpid = fc->pid;
5307 cg = get_pid_cgroup(initpid, "memory");
5308 if (!cg)
5309 return read_file_fuse("/proc/swaps", buf, size, d);
5310 prune_init_slice(cg);
5311
5312 memlimit = get_min_memlimit(cg, false);
5313
5314 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5315 if (ret < 0)
5316 return 0;
5317
5318 memusage = strtoul(memusage_str, NULL, 10);
5319
5320 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5321 if (ret >= 0)
5322 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5323 if (ret >= 0) {
5324 memswlimit = get_min_memlimit(cg, true);
5325 memswusage = strtoul(memswusage_str, NULL, 10);
5326 swap_total = (memswlimit - memlimit) / 1024;
5327 swap_free = (memswusage - memusage) / 1024;
5328 }
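/*
 * Editor's note, arithmetic example: a 1 GiB memory limit with a 1.5 GiB
 * mem+swap limit yields swap_total = 536870912 / 1024 = 524288 kB for the
 * Size column rendered below.
 */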
5329
5330 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5331
5332 /* When no mem + swap limit is specified or swapaccount=0*/
5333 if (!memswlimit) {
5334 __do_free char *line = NULL;
5335 __do_fclose FILE *f = NULL;
5336 size_t linelen = 0;
5337
5338 f = fopen("/proc/meminfo", "r");
5339 if (!f)
5340 return 0;
5341
5342 while (getline(&line, &linelen, f) != -1) {
5343 if (startswith(line, "SwapTotal:"))
5344 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5345 else if (startswith(line, "SwapFree:"))
5346 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5347 }
5348 }
5349
5350 if (swap_total > 0) {
5351 l = snprintf(d->buf + total_len, d->size - total_len,
5352 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5353 swap_total, swap_free);
5354 total_len += l;
5355 }
5356
5357 if (total_len < 0 || l < 0) {
5358 perror("Error writing to cache");
5359 return 0;
5360 }
5361
5362 d->cached = 1;
5363 d->size = (int)total_len;
5364
5365 if (total_len > size) total_len = size;
5366 memcpy(buf, d->buf, total_len);
5367 return total_len;
5368 }
5369
5370 /*
5371 * Collect the pids of the processes in a cgroup path.
5372 * E.g. read /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the pids.
5373 * @pid_buf : the array that receives the pids.
5374 * @dpath : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
5375 * @depth : how many levels of child cgroups to descend into.
5376 * @sum : the number of pids collected so far; the new total is returned.
5377 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu.
5378 */
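/*
 * Editor's note: refresh_load() below drives this as
 *     sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
 * descending at most DEPTH_DIR (3) levels of child cgroups.
 */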
5379 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5380 {
5381 DIR *dir;
5382 int fd;
5383 struct dirent *file;
5384 FILE *f = NULL;
5385 size_t linelen = 0;
5386 char *line = NULL;
5387 int pd;
5388 char *path_dir, *path;
5389 char **pid;
5390
5391 /* path = dpath + "/cgroup.procs" + '\0' */
5392 do {
5393 path = malloc(strlen(dpath) + 20);
5394 } while (!path);
5395
5396 strcpy(path, dpath);
5397 fd = openat(cfd, path, O_RDONLY);
5398 if (fd < 0)
5399 goto out;
5400
5401 dir = fdopendir(fd);
5402 if (dir == NULL) {
5403 close(fd);
5404 goto out;
5405 }
5406
5407 while (((file = readdir(dir)) != NULL) && depth > 0) {
5408 if (strncmp(file->d_name, ".", 1) == 0)
5409 continue;
5410 if (strncmp(file->d_name, "..", 1) == 0)
5411 continue;
5412 if (file->d_type == DT_DIR) {
5413 /* path + '/' + d_name + '\0' */
5414 do {
5415 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5416 } while (!path_dir);
5417 strcpy(path_dir, path);
5418 strcat(path_dir, "/");
5419 strcat(path_dir, file->d_name);
5420 pd = depth - 1;
5421 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5422 free(path_dir);
5423 }
5424 }
5425 closedir(dir);
5426
5427 strcat(path, "/cgroup.procs");
5428 fd = openat(cfd, path, O_RDONLY);
5429 if (fd < 0)
5430 goto out;
5431
5432 f = fdopen(fd, "r");
5433 if (!f) {
5434 close(fd);
5435 goto out;
5436 }
5437
5438 while (getline(&line, &linelen, f) != -1) {
5439 do {
5440 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5441 } while (!pid);
5442 *pid_buf = pid;
5443 do {
5444 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5445 } while (*(*pid_buf + sum) == NULL);
5446 strcpy(*(*pid_buf + sum), line);
5447 sum++;
5448 }
5449 fclose(f);
5450 out:
5451 if (line)
5452 free(line);
5453 free(path);
5454 return sum;
5455 }
5456 /*
5457 * calc_load calculates the load according to the following formula:
5458 * load1 = load0 * exp + active * (1 - exp)
5459 *
5460 * @load1: the new loadavg.
5461 * @load0: the former loadavg.
5462 * @active: the number of running (and uninterruptible) pids at this moment.
5463 * @exp: the fixed-point decay constant (EXP_1, EXP_5 or EXP_15) defined above.
5464 */
5465 static unsigned long
5466 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5467 {
5468 unsigned long newload;
5469
5470 active = active > 0 ? active * FIXED_1 : 0;
5471 newload = load * exp + active * (FIXED_1 - exp);
5472 if (active >= load)
5473 newload += FIXED_1 - 1;
5474
5475 return newload / FIXED_1;
5476 }
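/*
 * Editor's note, a worked example using the fixed-point constants defined
 * at the top of this file (FIXED_1 == 2048, EXP_1 == 1884): starting from
 * load == 0 with one runnable task, active becomes 1 * FIXED_1 == 2048, so
 *   newload = 0 * 1884 + 2048 * (2048 - 1884) + (2048 - 1) = 337919,
 *   337919 / 2048 == 164 ~= 0.08 * FIXED_1.
 * After one 5-second tick the 1-minute average thus reads "0.08": it moved
 * ~8% of the way toward the target, matching 1 - exp(-5/60).
 */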
5477
5478 /*
5479 * Returns 0 when the container p->cg has been removed (normal exit).
5480 * Returns -1 when an error occurred during the refresh.
5481 * A positive return value is the total number of pids found.
5482 */
5483 static int refresh_load(struct load_node *p, char *path)
5484 {
5485 FILE *f = NULL;
5486 char **idbuf;
5487 char proc_path[256];
5488 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5489 char *line = NULL;
5490 size_t linelen = 0;
5491 int sum, length;
5492 DIR *dp;
5493 struct dirent *file;
5494
5495 do {
5496 idbuf = malloc(sizeof(char *));
5497 } while (!idbuf);
5498 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5499 /* normal exit */
5500 if (sum == 0)
5501 goto out;
5502
5503 for (i = 0; i < sum; i++) {
5504 /* strip the trailing '\n' */
5505 length = strlen(idbuf[i])-1;
5506 idbuf[i][length] = '\0';
5507 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5508 if (ret < 0 || ret > 255) {
5509 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5510 i = sum;
5511 sum = -1;
5512 goto err_out;
5513 }
5514
5515 dp = opendir(proc_path);
5516 if (!dp) {
5517 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5518 continue;
5519 }
5520 while ((file = readdir(dp)) != NULL) {
5521 if (strncmp(file->d_name, ".", 1) == 0)
5522 continue;
5523 if (strncmp(file->d_name, "..", 1) == 0)
5524 continue;
5525 total_pid++;
5526 /* We make the biggest pid become last_pid. */
5527 ret = atoi(file->d_name);
5528 last_pid = (ret > last_pid) ? ret : last_pid;
5529
5530 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5531 if (ret < 0 || ret > 255) {
5532 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5533 i = sum;
5534 sum = -1;
5535 closedir(dp);
5536 goto err_out;
5537 }
5538 f = fopen(proc_path, "r");
5539 if (f != NULL) {
5540 while (getline(&line, &linelen, f) != -1) {
5541 /* Find State */
5542 if ((line[0] == 'S') && (line[1] == 't'))
5543 break;
5544 }
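/* The line reads "State:\tR (running)": the status letter sits at offset 7.
 * Count running (R) and uninterruptible (D) tasks, as the kernel does for
 * its loadavg. */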
5545 if ((line[7] == 'R') || (line[7] == 'D'))
5546 run_pid++;
5547 fclose(f);
5548 }
5549 }
5550 closedir(dp);
5551 }
5552 /* Calculate the loadavg. */
5553 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5554 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5555 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5556 p->run_pid = run_pid;
5557 p->total_pid = total_pid;
5558 p->last_pid = last_pid;
5559
5560 free(line);
5561 err_out:
5562 for (; i > 0; i--)
5563 free(idbuf[i-1]);
5564 out:
5565 free(idbuf);
5566 return sum;
5567 }
5568 /*
5569 * Traverse the hash table and update it.
5570 */
5571 void *load_begin(void *arg)
5572 {
5573
5574 char *path = NULL;
5575 int i, sum, length, ret;
5576 struct load_node *f;
5577 int first_node;
5578 clock_t time1, time2;
5579
5580 while (1) {
5581 if (loadavg_stop == 1)
5582 return NULL;
5583
5584 time1 = clock();
5585 for (i = 0; i < LOAD_SIZE; i++) {
5586 pthread_mutex_lock(&load_hash[i].lock);
5587 if (load_hash[i].next == NULL) {
5588 pthread_mutex_unlock(&load_hash[i].lock);
5589 continue;
5590 }
5591 f = load_hash[i].next;
5592 first_node = 1;
5593 while (f) {
5594 length = strlen(f->cg) + 2;
5595 do {
5596 /* strlen(f->cg) + '.' or '' + '\0' */
5597 path = malloc(length);
5598 } while (!path);
5599
5600 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
5601 if (ret < 0 || ret > length - 1) {
5602 /* snprintf failed, ignore the node.*/
5603 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5604 goto out;
5605 }
5606 sum = refresh_load(f, path);
5607 if (sum == 0) {
5608 f = del_node(f, i);
5609 } else {
5610 out: f = f->next;
5611 }
5612 free(path);
5613 /* load_hash[i].lock locks only on the first node.*/
5614 if (first_node == 1) {
5615 first_node = 0;
5616 pthread_mutex_unlock(&load_hash[i].lock);
5617 }
5618 }
5619 }
5620
5621 if (loadavg_stop == 1)
5622 return NULL;
5623
5624 time2 = clock();
5625 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5626 }
5627 }
5628
5629 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5630 struct fuse_file_info *fi)
5631 {
5632 struct fuse_context *fc = fuse_get_context();
5633 struct file_info *d = (struct file_info *)fi->fh;
5634 pid_t initpid;
5635 char *cg;
5636 ssize_t total_len = 0;
5637 char *cache = d->buf;
5638 struct load_node *n;
5639 int hash;
5640 int cfd, rv = 0;
5641 unsigned long a, b, c;
5642
5643 if (offset) {
5644 if (offset > d->size)
5645 return -EINVAL;
5646 if (!d->cached)
5647 return 0;
5648 int left = d->size - offset;
5649 total_len = left > size ? size : left;
5650 memcpy(buf, cache + offset, total_len);
5651 return total_len;
5652 }
5653 if (!loadavg)
5654 return read_file_fuse("/proc/loadavg", buf, size, d);
5655
5656 initpid = lookup_initpid_in_store(fc->pid);
5657 if (initpid <= 1 || is_shared_pidns(initpid))
5658 initpid = fc->pid;
5659 cg = get_pid_cgroup(initpid, "cpu");
5660 if (!cg)
5661 return read_file_fuse("/proc/loadavg", buf, size, d);
5662
5663 prune_init_slice(cg);
5664 hash = calc_hash(cg) % LOAD_SIZE;
5665 n = locate_node(cg, hash);
5666
5667 /* First time */
5668 if (n == NULL) {
5669 cfd = find_mounted_controller("cpu");
5670 if (cfd < 0) {
5671 /*
5672 * In locate_node() above, pthread_rwlock_unlock() isn't used
5673 * because delete is not allowed before read has ended.
5674 */
5675 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5676 rv = 0;
5677 goto err;
5678 }
5679 do {
5680 n = malloc(sizeof(struct load_node));
5681 } while (!n);
5682
5683 do {
5684 n->cg = malloc(strlen(cg)+1);
5685 } while (!n->cg);
5686 strcpy(n->cg, cg);
5687 n->avenrun[0] = 0;
5688 n->avenrun[1] = 0;
5689 n->avenrun[2] = 0;
5690 n->run_pid = 0;
5691 n->total_pid = 1;
5692 n->last_pid = initpid;
5693 n->cfd = cfd;
5694 insert_node(&n, hash);
5695 }
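/*
 * Editor's note: adding FIXED_1/200 (~0.005 in fixed point) rounds the value
 * to the nearest displayed hundredth, e.g. avenrun[0] == 164 prints as "0.08"
 * via LOAD_INT()/LOAD_FRAC().
 */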
5696 a = n->avenrun[0] + (FIXED_1/200);
5697 b = n->avenrun[1] + (FIXED_1/200);
5698 c = n->avenrun[2] + (FIXED_1/200);
5699 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5700 LOAD_INT(a), LOAD_FRAC(a),
5701 LOAD_INT(b), LOAD_FRAC(b),
5702 LOAD_INT(c), LOAD_FRAC(c),
5703 n->run_pid, n->total_pid, n->last_pid);
5704 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5705 if (total_len < 0 || total_len >= d->buflen) {
5706 lxcfs_error("%s\n", "Failed to write to cache");
5707 rv = 0;
5708 goto err;
5709 }
5710 d->size = (int)total_len;
5711 d->cached = 1;
5712
5713 if (total_len > size)
5714 total_len = size;
5715 memcpy(buf, d->buf, total_len);
5716 rv = total_len;
5717
5718 err:
5719 free(cg);
5720 return rv;
5721 }
5722 /* Return a positive number on success, return 0 on failure. */
5723 pthread_t load_daemon(int load_use)
5724 {
5725 int ret;
5726 pthread_t pid;
5727
5728 ret = init_load();
5729 if (ret == -1) {
5730 lxcfs_error("%s\n", "Failed to initialize hash table in load_daemon!");
5731 return 0;
5732 }
5733 ret = pthread_create(&pid, NULL, load_begin, NULL);
5734 if (ret != 0) {
5735 lxcfs_error("%s\n", "Failed to create thread in load_daemon!");
5736 load_free();
5737 return 0;
5738 }
5739 /* Enable the loadavg path; the caller passes load_use = 1. */
5740 loadavg = load_use;
5741 return pid;
5742 }
5743
5744 /* Returns 0 on success. */
5745 int stop_load_daemon(pthread_t pid)
5746 {
5747 int s;
5748
5749 /* Signal the thread to gracefully stop */
5750 loadavg_stop = 1;
5751
5752 s = pthread_join(pid, NULL); /* Make sure the load thread has exited. */
5753 if (s != 0) {
5754 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5755 return -1;
5756 }
5757
5758 load_free();
5759 loadavg_stop = 0;
5760
5761 return 0;
5762 }
5763
5764 static off_t get_procfile_size(const char *which)
5765 {
5766 FILE *f = fopen(which, "r");
5767 char *line = NULL;
5768 size_t len = 0;
5769 ssize_t sz, answer = 0;
5770 if (!f)
5771 return 0;
5772
5773 while ((sz = getline(&line, &len, f)) != -1)
5774 answer += sz;
5775 fclose(f);
5776 free(line);
5777
5778 return answer;
5779 }
5780
5781 int proc_getattr(const char *path, struct stat *sb)
5782 {
5783 struct timespec now;
5784
5785 memset(sb, 0, sizeof(struct stat));
5786 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5787 return -EINVAL;
5788 sb->st_uid = sb->st_gid = 0;
5789 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5790 if (strcmp(path, "/proc") == 0) {
5791 sb->st_mode = S_IFDIR | 00555;
5792 sb->st_nlink = 2;
5793 return 0;
5794 }
5795 if (strcmp(path, "/proc/meminfo") == 0 ||
5796 strcmp(path, "/proc/cpuinfo") == 0 ||
5797 strcmp(path, "/proc/uptime") == 0 ||
5798 strcmp(path, "/proc/stat") == 0 ||
5799 strcmp(path, "/proc/diskstats") == 0 ||
5800 strcmp(path, "/proc/swaps") == 0 ||
5801 strcmp(path, "/proc/loadavg") == 0) {
5802 sb->st_size = 0;
5803 sb->st_mode = S_IFREG | 00444;
5804 sb->st_nlink = 1;
5805 return 0;
5806 }
5807
5808 return -ENOENT;
5809 }
5810
5811 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5812 struct fuse_file_info *fi)
5813 {
5814 if (filler(buf, ".", NULL, 0) != 0 ||
5815 filler(buf, "..", NULL, 0) != 0 ||
5816 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5817 filler(buf, "meminfo", NULL, 0) != 0 ||
5818 filler(buf, "stat", NULL, 0) != 0 ||
5819 filler(buf, "uptime", NULL, 0) != 0 ||
5820 filler(buf, "diskstats", NULL, 0) != 0 ||
5821 filler(buf, "swaps", NULL, 0) != 0 ||
5822 filler(buf, "loadavg", NULL, 0) != 0)
5823 return -EINVAL;
5824 return 0;
5825 }
5826
5827 int proc_open(const char *path, struct fuse_file_info *fi)
5828 {
5829 int type = -1;
5830 struct file_info *info;
5831
5832 if (strcmp(path, "/proc/meminfo") == 0)
5833 type = LXC_TYPE_PROC_MEMINFO;
5834 else if (strcmp(path, "/proc/cpuinfo") == 0)
5835 type = LXC_TYPE_PROC_CPUINFO;
5836 else if (strcmp(path, "/proc/uptime") == 0)
5837 type = LXC_TYPE_PROC_UPTIME;
5838 else if (strcmp(path, "/proc/stat") == 0)
5839 type = LXC_TYPE_PROC_STAT;
5840 else if (strcmp(path, "/proc/diskstats") == 0)
5841 type = LXC_TYPE_PROC_DISKSTATS;
5842 else if (strcmp(path, "/proc/swaps") == 0)
5843 type = LXC_TYPE_PROC_SWAPS;
5844 else if (strcmp(path, "/proc/loadavg") == 0)
5845 type = LXC_TYPE_PROC_LOADAVG;
5846 if (type == -1)
5847 return -ENOENT;
5848
5849 info = malloc(sizeof(*info));
5850 if (!info)
5851 return -ENOMEM;
5852
5853 memset(info, 0, sizeof(*info));
5854 info->type = type;
5855
5856 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5857 do {
5858 info->buf = malloc(info->buflen);
5859 } while (!info->buf);
5860 memset(info->buf, 0, info->buflen);
5861 /* set actual size to buffer size */
5862 info->size = info->buflen;
5863
5864 fi->fh = (unsigned long)info;
5865 return 0;
5866 }
5867
5868 int proc_access(const char *path, int mask)
5869 {
5870 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5871 return 0;
5872
5873 /* these are all read-only */
5874 if ((mask & ~R_OK) != 0)
5875 return -EACCES;
5876 return 0;
5877 }
5878
5879 int proc_release(const char *path, struct fuse_file_info *fi)
5880 {
5881 do_release_file_info(fi);
5882 return 0;
5883 }
5884
5885 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5886 struct fuse_file_info *fi)
5887 {
5888 struct file_info *f = (struct file_info *) fi->fh;
5889
5890 switch (f->type) {
5891 case LXC_TYPE_PROC_MEMINFO:
5892 return proc_meminfo_read(buf, size, offset, fi);
5893 case LXC_TYPE_PROC_CPUINFO:
5894 return proc_cpuinfo_read(buf, size, offset, fi);
5895 case LXC_TYPE_PROC_UPTIME:
5896 return proc_uptime_read(buf, size, offset, fi);
5897 case LXC_TYPE_PROC_STAT:
5898 return proc_stat_read(buf, size, offset, fi);
5899 case LXC_TYPE_PROC_DISKSTATS:
5900 return proc_diskstats_read(buf, size, offset, fi);
5901 case LXC_TYPE_PROC_SWAPS:
5902 return proc_swaps_read(buf, size, offset, fi);
5903 case LXC_TYPE_PROC_LOADAVG:
5904 return proc_loadavg_read(buf, size, offset, fi);
5905 default:
5906 return -EINVAL;
5907 }
5908 }
5909
5910 /*
5911 * Functions needed to set up cgroups in the __constructor__.
5912 */
5913
5914 static bool umount_if_mounted(void)
5915 {
5916 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5917 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5918 return false;
5919 }
5920 return true;
5921 }
5922
5923 /* __typeof__ should be safe to use with all compilers. */
5924 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5925 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5926 {
5927 return (fs->f_type == (fs_type_magic)magic_val);
5928 }
5929
5930 /*
5931 * Looking at fs/proc_namespace.c, it appears we can
5932 * actually expect the rootfs entry to very specifically contain
5933 * " - rootfs rootfs "
5934 * IIUC, so long as we've chrooted so that rootfs is not our root,
5935 * the rootfs entry should always be skipped in mountinfo contents.
5936 */
5937 static bool is_on_ramfs(void)
5938 {
5939 FILE *f;
5940 char *p, *p2;
5941 char *line = NULL;
5942 size_t len = 0;
5943 int i;
5944
5945 f = fopen("/proc/self/mountinfo", "r");
5946 if (!f)
5947 return false;
5948
5949 while (getline(&line, &len, f) != -1) {
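/* Field 5 of a mountinfo line is the mount point; skip the first four
 * space-separated fields. */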
5950 for (p = line, i = 0; p && i < 4; i++)
5951 p = strchr(p + 1, ' ');
5952 if (!p)
5953 continue;
5954 p2 = strchr(p + 1, ' ');
5955 if (!p2)
5956 continue;
5957 *p2 = '\0';
5958 if (strcmp(p + 1, "/") == 0) {
5959 // this is '/'. is it the ramfs?
5960 p = strchr(p2 + 1, '-');
5961 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5962 free(line);
5963 fclose(f);
5964 return true;
5965 }
5966 }
5967 }
5968 free(line);
5969 fclose(f);
5970 return false;
5971 }
5972
5973 static int pivot_enter()
5974 {
5975 int ret = -1, oldroot = -1, newroot = -1;
5976
5977 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5978 if (oldroot < 0) {
5979 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5980 return ret;
5981 }
5982
5983 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5984 if (newroot < 0) {
5985 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5986 goto err;
5987 }
5988
5989 /* change into new root fs */
5990 if (fchdir(newroot) < 0) {
5991 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5992 goto err;
5993 }
5994
5995 /* pivot_root into our new root fs */
5996 if (pivot_root(".", ".") < 0) {
5997 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5998 goto err;
5999 }
6000
6001 /*
6002 * At this point the old root is mounted on top of our new root. To
6003 * unmount it we must not be chdir()'d into it, so escape back
6004 * to the old root.
6005 */
6006 if (fchdir(oldroot) < 0) {
6007 lxcfs_error("%s\n", "Failed to enter old root.");
6008 goto err;
6009 }
6010
6011 if (umount2(".", MNT_DETACH) < 0) {
6012 lxcfs_error("%s\n", "Failed to detach old root.");
6013 goto err;
6014 }
6015
6016 if (fchdir(newroot) < 0) {
6017 lxcfs_error("%s\n", "Failed to re-enter new root.");
6018 goto err;
6019 }
6020
6021 ret = 0;
6022
6023 err:
6024 if (oldroot > 0)
6025 close(oldroot);
6026 if (newroot > 0)
6027 close(newroot);
6028
6029 return ret;
6030 }
6031
6032 static int chroot_enter()
6033 {
6034 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6035 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
6036 return -1;
6037 }
6038
6039 if (chroot(".") < 0) {
6040 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6041 return -1;
6042 }
6043
6044 if (chdir("/") < 0) {
6045 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6046 return -1;
6047 }
6048
6049 return 0;
6050 }
6051
6052 static int permute_and_enter(void)
6053 {
6054 struct statfs sb;
6055
6056 if (statfs("/", &sb) < 0) {
6057 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6058 return -1;
6059 }
6060
6061 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6062 * likely report TMPFS_MAGIC. Hence, when it reports a mismatch we still
6063 * check /proc/self/mountinfo. */
6064 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6065 return chroot_enter();
6066
6067 if (pivot_enter() < 0) {
6068 lxcfs_error("%s\n", "Could not perform pivot root.");
6069 return -1;
6070 }
6071
6072 return 0;
6073 }
6074
6075 /* Prepare our new clean root. */
6076 static int permute_prepare(void)
6077 {
6078 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6079 lxcfs_error("%s\n", "Failed to create directory for new root.");
6080 return -1;
6081 }
6082
6083 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6084 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6085 return -1;
6086 }
6087
6088 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6089 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6090 return -1;
6091 }
6092
6093 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6094 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6095 return -1;
6096 }
6097
6098 return 0;
6099 }
6100
6101 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6102 static bool permute_root(void)
6103 {
6104 /* Prepare new root. */
6105 if (permute_prepare() < 0)
6106 return false;
6107
6108 /* Pivot into new root. */
6109 if (permute_and_enter() < 0)
6110 return false;
6111
6112 return true;
6113 }
6114
6115 static int preserve_mnt_ns(int pid)
6116 {
6117 int ret;
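/* Editor's note: 21 bytes are enough for the decimal digits of any 64-bit pid. */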
6118 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6119 char path[len];
6120
6121 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6122 if (ret < 0 || (size_t)ret >= len)
6123 return -1;
6124
6125 return open(path, O_RDONLY | O_CLOEXEC);
6126 }
6127
6128 static bool cgfs_prepare_mounts(void)
6129 {
6130 if (!mkdir_p(BASEDIR, 0700)) {
6131 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6132 return false;
6133 }
6134
6135 if (!umount_if_mounted()) {
6136 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6137 return false;
6138 }
6139
6140 if (unshare(CLONE_NEWNS) < 0) {
6141 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6142 return false;
6143 }
6144
6145 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6146 if (cgroup_mount_ns_fd < 0) {
6147 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6148 return false;
6149 }
6150
6151 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6152 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6153 return false;
6154 }
6155
6156 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6157 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6158 return false;
6159 }
6160
6161 return true;
6162 }
6163
6164 static bool cgfs_mount_hierarchies(void)
6165 {
6166 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
6167 return false;
6168
6169 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
6170 return false;
6171
6172 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
6173 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
6174 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
6175 if ((*h)->fd < 0)
6176 return false;
6177 }
6178
6179 return true;
6180 }
6181
6182 static bool cgfs_setup_controllers(void)
6183 {
6184 if (!cgfs_prepare_mounts())
6185 return false;
6186
6187 if (!cgfs_mount_hierarchies()) {
6188 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6189 return false;
6190 }
6191
6192 if (!permute_root())
6193 return false;
6194
6195 return true;
6196 }
6197
6198 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6199 {
6200 char *cret;
6201 char cwd[MAXPATHLEN];
6202 int init_ns = -1;
6203
6204 cgroup_ops = cgroup_init();
6205 if (!cgroup_ops)
6206 return;
6207
6208 /* Preserve initial namespace. */
6209 init_ns = preserve_mnt_ns(getpid());
6210 if (init_ns < 0) {
6211 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6212 goto out;
6213 }
6214
6215 cret = getcwd(cwd, MAXPATHLEN);
6216 if (!cret)
6217 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6218
6219 /* This function unshare()s (CLONE_NEWNS) from our initial mount namespace
6220 * so that we can privately mount the lxcfs cgroup hierarchies. */
6221 if (!cgfs_setup_controllers()) {
6222 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6223 goto out;
6224 }
6225
6226 if (setns(init_ns, 0) < 0) {
6227 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6228 goto out;
6229 }
6230
6231 if (!cret || chdir(cwd) < 0)
6232 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6233
6234 if (!init_cpuview()) {
6235 lxcfs_error("%s\n", "failed to init CPU view");
6236 goto out;
6237 }
6238
6239 print_subsystems();
6240
6241 out:
6242 if (init_ns >= 0)
6243 close(init_ns);
6244 }
6245
6246 static void __attribute__((destructor)) free_subsystems(void)
6247 {
6248 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6249
6250 cgroup_exit(cgroup_ops);
6251 free_cpuview();
6252
6253 if (cgroup_mount_ns_fd >= 0)
6254 close(cgroup_mount_ns_fd);
6255 }