1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "cgroups/cgroup.h"
42 #include "cgroups/cgroup_utils.h"
43 #include "memory_utils.h"
44 #include "config.h"
45
46 /* Define pivot_root() if missing from the C library */
47 #ifndef HAVE_PIVOT_ROOT
48 static int pivot_root(const char * new_root, const char * put_old)
49 {
50 #ifdef __NR_pivot_root
51 return syscall(__NR_pivot_root, new_root, put_old);
52 #else
53 errno = ENOSYS;
54 return -1;
55 #endif
56 }
57 #else
58 extern int pivot_root(const char * new_root, const char * put_old);
59 #endif
60
61 struct cpuacct_usage {
62 uint64_t user;
63 uint64_t system;
64 uint64_t idle;
65 bool online;
66 };
67
68 /* Constants for the hash table. */
69 #define LOAD_SIZE 100 /* the size of the hash table */
70 #define FLUSH_TIME 5 /* the flush rate */
71 #define DEPTH_DIR 3 /* the search depth per cgroup */
72 /* Constants for calculating loadavg. */
73 #define FSHIFT 11 /* nr of bits of precision */
74 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
75 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
76 #define EXP_5 2014 /* 1/exp(5sec/5min) */
77 #define EXP_15 2037 /* 1/exp(5sec/15min) */
78 #define LOAD_INT(x) ((x) >> FSHIFT)
79 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
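/*
 * Worked example (illustrative): avenrun values are fixed-point with
 * FSHIFT fractional bits, so FIXED_1 is 2048 and a stored value of 3072
 * decodes as LOAD_INT(3072) = 1 and LOAD_FRAC(3072) = 50, i.e. "1.50"
 * when printed the way /proc/loadavg does:
 *
 *	snprintf(buf, sizeof(buf), "%lu.%02lu",
 *		 LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]));
 */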
80 /*
81 * This parameter is used for proc_loadavg_read().
82 * 1 means the loadavg calculation is enabled, 0 means it is disabled.
83 */
84 static int loadavg = 0;
85 static volatile sig_atomic_t loadavg_stop = 0;
86 static int calc_hash(const char *name)
87 {
88 unsigned int hash = 0;
89 unsigned int x = 0;
90 /* ELFHash algorithm. */
91 while (*name) {
92 hash = (hash << 4) + *name++;
93 x = hash & 0xf0000000;
94 if (x != 0)
95 hash ^= (x >> 24);
96 hash &= ~x;
97 }
98 return (hash & 0x7fffffff);
99 }
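/*
 * Sketch (an assumption about intended use, mirroring how the readers of
 * load_hash index it): the 31-bit ELF hash of a cgroup path is reduced
 * modulo LOAD_SIZE to pick a bucket:
 *
 *	int idx = calc_hash(cg) % LOAD_SIZE;
 *	struct load_node *n = locate_node(cg, idx);
 */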
100
101 struct load_node {
102 char *cg; /* cgroup path */
103 unsigned long avenrun[3]; /* Load averages */
104 unsigned int run_pid;
105 unsigned int total_pid;
106 unsigned int last_pid;
107 int cfd; /* The file descriptor of the mounted cgroup */
108 struct load_node *next;
109 struct load_node **pre;
110 };
111
112 struct load_head {
113 /*
114 * The lock serializes inserting and refreshing load_node entries. For the
115 * first load_node of each hash bucket, insert and refresh are mutually
116 * exclusive.
117 */
118 pthread_mutex_t lock;
119 /*
120 * The rdlock serializes reading loadavg against deleting a load_node. Within
121 * each hash bucket, read and delete are mutually exclusive, but concurrent
122 * reads are allowed. This rdlock is at list level.
123 */
124 pthread_rwlock_t rdlock;
125 /*
126 * The rilock serializes reading loadavg against inserting a load_node. For
127 * the first load_node of each hash bucket, read and insert are mutually
128 * exclusive, but concurrent reads are allowed.
129 */
130 pthread_rwlock_t rilock;
131 struct load_node *next;
132 };
133
134 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
135 /*
136 * init_load initializes the hash table.
137 * Return 0 on success, return -1 on failure.
138 */
139 static int init_load(void)
140 {
141 int i;
142 int ret;
143
144 for (i = 0; i < LOAD_SIZE; i++) {
145 load_hash[i].next = NULL;
146 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
147 if (ret != 0) {
148 lxcfs_error("%s\n", "Failed to initialize lock");
149 goto out3;
150 }
151 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
152 if (ret != 0) {
153 lxcfs_error("%s\n", "Failed to initialize rdlock");
154 goto out2;
155 }
156 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
157 if (ret != 0) {
158 lxcfs_error("%s\n", "Failed to initialize rilock");
159 goto out1;
160 }
161 }
162 return 0;
163 out1:
164 pthread_rwlock_destroy(&load_hash[i].rdlock);
165 out2:
166 pthread_mutex_destroy(&load_hash[i].lock);
167 out3:
168 while (i > 0) {
169 i--;
170 pthread_mutex_destroy(&load_hash[i].lock);
171 pthread_rwlock_destroy(&load_hash[i].rdlock);
172 pthread_rwlock_destroy(&load_hash[i].rilock);
173 }
174 return -1;
175 }
176
177 static void insert_node(struct load_node **n, int locate)
178 {
179 struct load_node *f;
180
181 pthread_mutex_lock(&load_hash[locate].lock);
182 pthread_rwlock_wrlock(&load_hash[locate].rilock);
183 f = load_hash[locate].next;
184 load_hash[locate].next = *n;
185
186 (*n)->pre = &(load_hash[locate].next);
187 if (f)
188 f->pre = &((*n)->next);
189 (*n)->next = f;
190 pthread_mutex_unlock(&load_hash[locate].lock);
191 pthread_rwlock_unlock(&load_hash[locate].rilock);
192 }
193 /*
194 * locate_node() finds the node for a given cgroup; a non-NULL return
195 * means success. Note that rdlock is deliberately not released before
196 * returning, because the caller still has to read the node and deletion
197 * must not happen until that read has finished. The rdlock is released
198 * only in proc_loadavg_read().
199 */
200 static struct load_node *locate_node(char *cg, int locate)
201 {
202 struct load_node *f = NULL;
203 int i = 0;
204
205 pthread_rwlock_rdlock(&load_hash[locate].rilock);
206 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
207 if (load_hash[locate].next == NULL) {
208 pthread_rwlock_unlock(&load_hash[locate].rilock);
209 return f;
210 }
211 f = load_hash[locate].next;
212 pthread_rwlock_unlock(&load_hash[locate].rilock);
213 while (f && ((i = strcmp(f->cg, cg)) != 0))
214 f = f->next;
215 return f;
216 }
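/*
 * Sketch of the read path implied by the locking contract above: the
 * caller, not locate_node(), drops rdlock once it has copied the loadavg
 * data out of the node. Note that rdlock is held even when NULL is
 * returned:
 *
 *	struct load_node *n = locate_node(cg, idx);
 *	if (n)
 *		... copy n->avenrun[] etc. ...
 *	pthread_rwlock_unlock(&load_hash[idx].rdlock);
 */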
217
218 /* Delete the load_node n and return the next node of it. */
219 static struct load_node *del_node(struct load_node *n, int locate)
220 {
221 struct load_node *g;
222
223 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
224 if (n->next == NULL) {
225 *(n->pre) = NULL;
226 } else {
227 *(n->pre) = n->next;
228 n->next->pre = n->pre;
229 }
230 g = n->next;
231 free_disarm(n->cg);
232 free_disarm(n);
233 pthread_rwlock_unlock(&load_hash[locate].rdlock);
234 return g;
235 }
236
237 static void load_free(void)
238 {
239 struct load_node *f, *p;
240
241 for (int i = 0; i < LOAD_SIZE; i++) {
242 pthread_mutex_lock(&load_hash[i].lock);
243 pthread_rwlock_wrlock(&load_hash[i].rilock);
244 pthread_rwlock_wrlock(&load_hash[i].rdlock);
245 if (load_hash[i].next == NULL) {
246 pthread_mutex_unlock(&load_hash[i].lock);
247 pthread_mutex_destroy(&load_hash[i].lock);
248 pthread_rwlock_unlock(&load_hash[i].rilock);
249 pthread_rwlock_destroy(&load_hash[i].rilock);
250 pthread_rwlock_unlock(&load_hash[i].rdlock);
251 pthread_rwlock_destroy(&load_hash[i].rdlock);
252 continue;
253 }
254
255 for (f = load_hash[i].next; f;) {
256 free_disarm(f->cg);
257 p = f->next;
258 free_disarm(f);
259 f = p;
260 }
261
262 pthread_mutex_unlock(&load_hash[i].lock);
263 pthread_mutex_destroy(&load_hash[i].lock);
264 pthread_rwlock_unlock(&load_hash[i].rilock);
265 pthread_rwlock_destroy(&load_hash[i].rilock);
266 pthread_rwlock_unlock(&load_hash[i].rdlock);
267 pthread_rwlock_destroy(&load_hash[i].rdlock);
268 }
269 }
270
271 /* Data for CPU view */
272 struct cg_proc_stat {
273 char *cg;
274 struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
275 struct cpuacct_usage *view; // Usage stats reported to the container
276 int cpu_count;
277 pthread_mutex_t lock; // For node manipulation
278 struct cg_proc_stat *next;
279 };
280
281 struct cg_proc_stat_head {
282 struct cg_proc_stat *next;
283 time_t lastcheck;
284
285 /*
286 * For access to the list. Reading can be parallel, pruning is exclusive.
287 */
288 pthread_rwlock_t lock;
289 };
290
291 #define CPUVIEW_HASH_SIZE 100
292 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
293
294 static bool cpuview_init_head(struct cg_proc_stat_head **head)
295 {
296 *head = malloc(sizeof(struct cg_proc_stat_head));
297 if (!(*head)) {
298 lxcfs_error("%s\n", strerror(errno));
299 return false;
300 }
301
302 (*head)->lastcheck = time(NULL);
303 (*head)->next = NULL;
304
305 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
306 lxcfs_error("%s\n", "Failed to initialize list lock");
307 free_disarm(*head);
308 return false;
309 }
310
311 return true;
312 }
313
314 static bool init_cpuview()
315 {
316 int i;
317
318 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
319 proc_stat_history[i] = NULL;
320
321 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
322 if (!cpuview_init_head(&proc_stat_history[i]))
323 goto err;
324 }
325
326 return true;
327
328 err:
329 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
330 if (proc_stat_history[i])
331 free_disarm(proc_stat_history[i]);
332 }
333
334 return false;
335 }
336
337 static void free_proc_stat_node(struct cg_proc_stat *node)
338 {
339 pthread_mutex_destroy(&node->lock);
340 free_disarm(node->cg);
341 free_disarm(node->usage);
342 free_disarm(node->view);
343 free_disarm(node);
344 }
345
346 static void cpuview_free_head(struct cg_proc_stat_head *head)
347 {
348 struct cg_proc_stat *node, *tmp;
349
350 if (head->next) {
351 node = head->next;
352
353 for (;;) {
354 tmp = node;
355 node = node->next;
356 free_proc_stat_node(tmp);
357
358 if (!node)
359 break;
360 }
361 }
362
363 pthread_rwlock_destroy(&head->lock);
364 free_disarm(head);
365 }
366
367 static void free_cpuview()
368 {
369 int i;
370
371 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
372 if (proc_stat_history[i])
373 cpuview_free_head(proc_stat_history[i]);
374 }
375 }
376
377 /*
378 * A table caching which pid is init for a pid namespace.
379 * When looking up which pid is init for $qpid, we:
380 * 1. Stat /proc/$qpid/ns/pid.
381 * 2. Check whether the ino_t is in our store.
382 * a. if not, fork a child in qpid's ns to send us
383 * ucred.pid = 1, and read the initpid. Cache
384 * initpid and creation time for /proc/initpid
385 * in a new store entry.
386 * b. if so, verify that /proc/initpid still matches
387 * what we have saved. If not, clear the store
388 * entry and go back to a. If so, return the
389 * cached initpid.
390 */
391 struct pidns_init_store {
392 ino_t ino; // inode number for /proc/$pid/ns/pid
393 pid_t initpid; // the pid of init in that ns
394 long int ctime; // the time at which /proc/$initpid was created
395 struct pidns_init_store *next;
396 long int lastcheck;
397 };
398
399 /* lol - look at how they are allocated in the kernel */
400 #define PIDNS_HASH_SIZE 4096
401 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
402
403 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
404 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
405 static void lock_mutex(pthread_mutex_t *l)
406 {
407 int ret;
408
409 if ((ret = pthread_mutex_lock(l)) != 0) {
410 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
411 exit(1);
412 }
413 }
414
415 struct cgroup_ops *cgroup_ops;
416
417 static void unlock_mutex(pthread_mutex_t *l)
418 {
419 int ret;
420
421 if ((ret = pthread_mutex_unlock(l)) != 0) {
422 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
423 exit(1);
424 }
425 }
426
427 static void store_lock(void)
428 {
429 lock_mutex(&pidns_store_mutex);
430 }
431
432 static void store_unlock(void)
433 {
434 unlock_mutex(&pidns_store_mutex);
435 }
436
437 /* Must be called under store_lock */
438 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
439 {
440 struct stat initsb;
441 char fnam[100];
442
443 snprintf(fnam, 100, "/proc/%d", e->initpid);
444 if (stat(fnam, &initsb) < 0)
445 return false;
446
447 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
448 initsb.st_ctime, e->initpid);
449
450 if (e->ctime != initsb.st_ctime)
451 return false;
452 return true;
453 }
454
455 /* Must be called under store_lock */
456 static void remove_initpid(struct pidns_init_store *e)
457 {
458 struct pidns_init_store *tmp;
459 int h;
460
461 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
462
463 h = HASH(e->ino);
464 if (pidns_hash_table[h] == e) {
465 pidns_hash_table[h] = e->next;
466 free_disarm(e);
467 return;
468 }
469
470 tmp = pidns_hash_table[h];
471 while (tmp) {
472 if (tmp->next == e) {
473 tmp->next = e->next;
474 free_disarm(e);
475 return;
476 }
477 tmp = tmp->next;
478 }
479 }
480
481 #define PURGE_SECS 5
482 /* Must be called under store_lock */
483 static void prune_initpid_store(void)
484 {
485 static long int last_prune = 0;
486 struct pidns_init_store *e, *prev, *delme;
487 long int now, threshold;
488 int i;
489
490 if (!last_prune) {
491 last_prune = time(NULL);
492 return;
493 }
494 now = time(NULL);
495 if (now < last_prune + PURGE_SECS)
496 return;
497
498 lxcfs_debug("%s\n", "Pruning.");
499
500 last_prune = now;
501 threshold = now - 2 * PURGE_SECS;
502
503 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
504 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
505 if (e->lastcheck < threshold) {
506
507 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
508
509 delme = e;
510 if (prev)
511 prev->next = e->next;
512 else
513 pidns_hash_table[i] = e->next;
514 e = e->next;
515 free_disarm(delme);
516 } else {
517 prev = e;
518 e = e->next;
519 }
520 }
521 }
522 }
523
524 /* Must be called under store_lock */
525 static void save_initpid(struct stat *sb, pid_t pid)
526 {
527 struct pidns_init_store *e;
528 char fpath[100];
529 struct stat procsb;
530 int h;
531
532 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
533
534 snprintf(fpath, 100, "/proc/%d", pid);
535 if (stat(fpath, &procsb) < 0)
536 return;
537 do {
538 e = malloc(sizeof(*e));
539 } while (!e);
540 e->ino = sb->st_ino;
541 e->initpid = pid;
542 e->ctime = procsb.st_ctime;
543 h = HASH(e->ino);
544 e->next = pidns_hash_table[h];
545 e->lastcheck = time(NULL);
546 pidns_hash_table[h] = e;
547 }
548
549 /*
550 * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
551 * entry for the inode number and creation time. Verify that the init pid
552 * is still valid. If not, remove it. Return the entry if valid, NULL
553 * otherwise.
554 * Must be called under store_lock
555 */
556 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
557 {
558 int h = HASH(sb->st_ino);
559 struct pidns_init_store *e = pidns_hash_table[h];
560
561 while (e) {
562 if (e->ino == sb->st_ino) {
563 if (initpid_still_valid(e, sb)) {
564 e->lastcheck = time(NULL);
565 return e;
566 }
567 remove_initpid(e);
568 return NULL;
569 }
570 e = e->next;
571 }
572
573 return NULL;
574 }
575
576 static int is_dir(const char *path, int fd)
577 {
578 struct stat statbuf;
579 int ret = fstatat(fd, path, &statbuf, 0);
580 if (ret == 0 && S_ISDIR(statbuf.st_mode))
581 return 1;
582 return 0;
583 }
584
585 static int preserve_ns(const int pid, const char *ns)
586 {
587 int ret;
588 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
589 #define __NS_PATH_LEN 50
590 char path[__NS_PATH_LEN];
591
592 /* This way we can use this function to also check whether namespaces
593 * are supported by the kernel by passing in NULL or the empty
594 * string.
595 */
596 ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
597 !ns || strcmp(ns, "") == 0 ? "" : "/",
598 !ns || strcmp(ns, "") == 0 ? "" : ns);
599 if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
600 errno = EFBIG;
601 return -1;
602 }
603
604 return open(path, O_RDONLY | O_CLOEXEC);
605 }
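/*
 * Example (illustrative): preserve_ns(pid, "pid") opens /proc/<pid>/ns/pid,
 * while preserve_ns(pid, NULL) opens /proc/<pid>/ns itself, which lets a
 * caller probe whether the kernel exposes namespace files at all:
 *
 *	int fd = preserve_ns(getpid(), "pid");
 *	if (fd < 0 && errno == ENOENT)
 *		... kernel lacks support for this namespace ...
 */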
606
607 /**
608 * in_same_namespace - Check whether two processes are in the same namespace.
609 * @pid1 - PID of the first process.
610 * @pid2 - PID of the second process.
611 * @ns - Name of the namespace to check. Must correspond to one of the names
612 * for the namespaces as shown in /proc/<pid>/ns/
613 *
614 * Returns an fd referring to the namespace of the second process (@pid2)
615 * if the two processes are not in the same namespace; returns -EINVAL if
616 * they are in the same namespace, and -1 if an error occurred.
617 */
618 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
619 {
620 __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
621 int ret = -1;
622 struct stat ns_st1, ns_st2;
623
624 ns_fd1 = preserve_ns(pid1, ns);
625 if (ns_fd1 < 0) {
626 /* The kernel does not support this namespace. This is not an
627 * error.
628 */
629 if (errno == ENOENT)
630 return -EINVAL;
631
632 return -1;
633 }
634
635 ns_fd2 = preserve_ns(pid2, ns);
636 if (ns_fd2 < 0)
637 return -1;
638
639 ret = fstat(ns_fd1, &ns_st1);
640 if (ret < 0)
641 return -1;
642
643 ret = fstat(ns_fd2, &ns_st2);
644 if (ret < 0)
645 return -1;
646
647 /* processes are in the same namespace */
648 if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
649 return -EINVAL;
650
651 /* processes are in different namespaces */
652 return move_fd(ns_fd2);
653 }
654
655 static bool is_shared_pidns(pid_t pid)
656 {
657 if (pid != 1)
658 return false;
659
660 if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
661 return true;
662
663 return false;
664 }
665
666 static bool write_string(const char *fnam, const char *string, int fd)
667 {
668 FILE *f;
669 size_t len, ret;
670
671 f = fdopen(fd, "w");
672 if (!f)
673 return false;
674
675 len = strlen(string);
676 ret = fwrite(string, 1, len, f);
677 if (ret != len) {
678 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
679 strerror(errno), string, fnam);
680 fclose(f);
681 return false;
682 }
683
684 if (fclose(f) < 0) {
685 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
686 return false;
687 }
688
689 return true;
690 }
691
692 struct cgfs_files {
693 char *name;
694 uint32_t uid, gid;
695 uint32_t mode;
696 };
697
698 static void print_subsystems(void)
699 {
700 int i = 0;
701
702 fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
703 fprintf(stderr, "hierarchies:\n");
704 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
705 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
706 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
707 }
708 }
709
710 /* do we need to do any massaging here? I'm not sure... */
711 /* Return the mounted controller and store the corresponding open file descriptor
712 * referring to the controller mountpoint in the private lxcfs namespace in
713 * @cfd.
714 */
715 static int find_mounted_controller(const char *controller)
716 {
717 struct hierarchy *h;
718
719 h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
720 return h ? h->fd : -EBADF;
721 }
722
723 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
724 const char *value)
725 {
726 int ret, fd, cfd;
727 size_t len;
728 char *fnam;
729
730 cfd = find_mounted_controller(controller);
731 if (cfd < 0)
732 return false;
733
734 /* Make sure we pass a relative path to *at() family of functions.
735 * . + /cgroup + / + file + \0
736 */
737 len = strlen(cgroup) + strlen(file) + 3;
738 fnam = alloca(len);
739 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
740 if (ret < 0 || (size_t)ret >= len)
741 return false;
742
743 fd = openat(cfd, fnam, O_WRONLY);
744 if (fd < 0)
745 return false;
746
747 return write_string(fnam, value, fd);
748 }
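/*
 * Example (illustrative only; the controller, cgroup and file names below
 * are made up, not calls made elsewhere in this file):
 *
 *	cgfs_set_value("memory", "lxc/c1", "memory.limit_in_bytes", "536870912");
 *
 * opens ./lxc/c1/memory.limit_in_bytes relative to the memory controller's
 * mountpoint fd and writes the value through write_string().
 */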
749
750 // Chown all the files in the cgroup directory. We do this when we create
751 // a cgroup on behalf of a user.
752 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
753 {
754 struct dirent *direntp;
755 char path[MAXPATHLEN];
756 size_t len;
757 DIR *d;
758 int fd1, ret;
759
760 len = strlen(dirname);
761 if (len >= MAXPATHLEN) {
762 lxcfs_error("Pathname too long: %s\n", dirname);
763 return;
764 }
765
766 fd1 = openat(fd, dirname, O_DIRECTORY);
767 if (fd1 < 0)
768 return;
769
770 d = fdopendir(fd1);
771 if (!d) {
772 lxcfs_error("Failed to open %s\n", dirname);
773 return;
774 }
775
776 while ((direntp = readdir(d))) {
777 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
778 continue;
779 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
780 if (ret < 0 || ret >= MAXPATHLEN) {
781 lxcfs_error("Pathname too long under %s\n", dirname);
782 continue;
783 }
784 if (fchownat(fd, path, uid, gid, 0) < 0)
785 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
786 }
787 closedir(d);
788 }
789
790 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
791 {
792 int cfd;
793 size_t len;
794 char *dirnam;
795
796 cfd = find_mounted_controller(controller);
797 if (cfd < 0)
798 return -EINVAL;
799
800 /* Make sure we pass a relative path to *at() family of functions.
801 * . + /cg + \0
802 */
803 len = strlen(cg) + 2;
804 dirnam = alloca(len);
805 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
806
807 if (mkdirat(cfd, dirnam, 0755) < 0)
808 return -errno;
809
810 if (uid == 0 && gid == 0)
811 return 0;
812
813 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
814 return -errno;
815
816 chown_all_cgroup_files(dirnam, uid, gid, cfd);
817
818 return 0;
819 }
820
821 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
822 {
823 struct dirent *direntp;
824 DIR *dir;
825 bool ret = false;
826 char pathname[MAXPATHLEN];
827 int dupfd;
828
829 dupfd = dup(fd); // fdopendir() takes ownership of the fd and closedir() will close it, so work on a duplicate.
830 if (dupfd < 0)
831 return false;
832
833 dir = fdopendir(dupfd);
834 if (!dir) {
835 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
836 close(dupfd);
837 return false;
838 }
839
840 while ((direntp = readdir(dir))) {
841 struct stat mystat;
842 int rc;
843
844 if (!strcmp(direntp->d_name, ".") ||
845 !strcmp(direntp->d_name, ".."))
846 continue;
847
848 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
849 if (rc < 0 || rc >= MAXPATHLEN) {
850 lxcfs_error("%s\n", "Pathname too long.");
851 continue;
852 }
853
854 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
855 if (rc) {
856 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
857 continue;
858 }
859 if (S_ISDIR(mystat.st_mode))
860 if (!recursive_rmdir(pathname, fd, cfd))
861 lxcfs_debug("Error removing %s.\n", pathname);
862 }
863
864 ret = true;
865 if (closedir(dir) < 0) {
866 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
867 ret = false;
868 }
869
870 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
871 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
872 ret = false;
873 }
874
875 close(dupfd);
876
877 return ret;
878 }
879
880 bool cgfs_remove(const char *controller, const char *cg)
881 {
882 int fd, cfd;
883 size_t len;
884 char *dirnam;
885 bool bret;
886
887 cfd = find_mounted_controller(controller);
888 if (cfd < 0)
889 return false;
890
891 /* Make sure we pass a relative path to *at() family of functions.
892 * . + /cg + \0
893 */
894 len = strlen(cg) + 2;
895 dirnam = alloca(len);
896 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
897
898 fd = openat(cfd, dirnam, O_DIRECTORY);
899 if (fd < 0)
900 return false;
901
902 bret = recursive_rmdir(dirnam, fd, cfd);
903 close(fd);
904 return bret;
905 }
906
907 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
908 {
909 int cfd;
910 size_t len;
911 char *pathname;
912
913 cfd = find_mounted_controller(controller);
914 if (cfd < 0)
915 return false;
916
917 /* Make sure we pass a relative path to *at() family of functions.
918 * . + /file + \0
919 */
920 len = strlen(file) + 2;
921 pathname = alloca(len);
922 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
923 if (fchmodat(cfd, pathname, mode, 0) < 0)
924 return false;
925 return true;
926 }
927
928 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
929 {
930 size_t len;
931 char *fname;
932
933 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
934 fname = alloca(len);
935 snprintf(fname, len, "%s/tasks", dirname);
936 if (fchownat(fd, fname, uid, gid, 0) != 0)
937 return -errno;
938 snprintf(fname, len, "%s/cgroup.procs", dirname);
939 if (fchownat(fd, fname, uid, gid, 0) != 0)
940 return -errno;
941 return 0;
942 }
943
944 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
945 {
946 int cfd;
947 size_t len;
948 char *pathname;
949
950 cfd = find_mounted_controller(controller);
951 if (cfd < 0)
952 return false;
953
954 /* Make sure we pass a relative path to *at() family of functions.
955 * . + /file + \0
956 */
957 len = strlen(file) + 2;
958 pathname = alloca(len);
959 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
960 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
961 return -errno;
962
963 if (is_dir(pathname, cfd))
964 // like cgmanager did, we want to chown the tasks file as well
965 return chown_tasks_files(pathname, uid, gid, cfd);
966
967 return 0;
968 }
969
970 FILE *open_pids_file(const char *controller, const char *cgroup)
971 {
972 int fd, cfd;
973 size_t len;
974 char *pathname;
975
976 cfd = find_mounted_controller(controller);
977 if (cfd < 0)
978 return NULL;
979
980 /* Make sure we pass a relative path to *at() family of functions.
981 * . + /cgroup + / "cgroup.procs" + \0
982 */
983 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
984 pathname = alloca(len);
985 snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
986
987 fd = openat(cfd, pathname, O_WRONLY);
988 if (fd < 0)
989 return NULL;
990
991 return fdopen(fd, "w");
992 }
993
994 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
995 void ***list, size_t typesize,
996 void* (*iterator)(const char*, const char*, const char*))
997 {
998 int cfd, fd, ret;
999 size_t len;
1000 char *cg;
1001 char pathname[MAXPATHLEN];
1002 size_t sz = 0, asz = 0;
1003 struct dirent *dirent;
1004 DIR *dir;
1005
1006 cfd = find_mounted_controller(controller);
1007 *list = NULL;
1008 if (cfd < 0)
1009 return false;
1010
1011 /* Make sure we pass a relative path to *at() family of functions. */
1012 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1013 cg = alloca(len);
1014 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1015 if (ret < 0 || (size_t)ret >= len) {
1016 lxcfs_error("Pathname too long under %s\n", cgroup);
1017 return false;
1018 }
1019
1020 fd = openat(cfd, cg, O_DIRECTORY);
1021 if (fd < 0)
1022 return false;
1023
1024 dir = fdopendir(fd);
1025 if (!dir)
1026 return false;
1027
1028 while ((dirent = readdir(dir))) {
1029 struct stat mystat;
1030
1031 if (!strcmp(dirent->d_name, ".") ||
1032 !strcmp(dirent->d_name, ".."))
1033 continue;
1034
1035 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1036 if (ret < 0 || ret >= MAXPATHLEN) {
1037 lxcfs_error("Pathname too long under %s\n", cg);
1038 continue;
1039 }
1040
1041 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1042 if (ret) {
1043 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1044 continue;
1045 }
1046 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1047 (directories && !S_ISDIR(mystat.st_mode)))
1048 continue;
1049
1050 if (sz+2 >= asz) {
1051 void **tmp;
1052 asz += BATCH_SIZE;
1053 do {
1054 tmp = realloc(*list, asz * typesize);
1055 } while (!tmp);
1056 *list = tmp;
1057 }
1058 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1059 (*list)[sz+1] = NULL;
1060 sz++;
1061 }
1062 if (closedir(dir) < 0) {
1063 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1064 return false;
1065 }
1066 return true;
1067 }
1068
1069 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1070 {
1071 char *dup;
1072 do {
1073 dup = strdup(dir_entry);
1074 } while (!dup);
1075 return dup;
1076 }
1077
1078 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1079 {
1080 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1081 }
1082
1083 void free_key(struct cgfs_files *k)
1084 {
1085 if (!k)
1086 return;
1087 free_disarm(k->name);
1088 free_disarm(k);
1089 }
1090
1091 void free_keys(struct cgfs_files **keys)
1092 {
1093 int i;
1094
1095 if (!keys)
1096 return;
1097 for (i = 0; keys[i]; i++) {
1098 free_key(keys[i]);
1099 }
1100 free_disarm(keys);
1101 }
1102
1103 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1104 {
1105 int ret, cfd;
1106 size_t len;
1107 char *fnam;
1108
1109 cfd = find_mounted_controller(controller);
1110 if (cfd < 0)
1111 return false;
1112
1113 /* Make sure we pass a relative path to *at() family of functions.
1114 * . + /cgroup + / + file + \0
1115 */
1116 len = strlen(cgroup) + strlen(file) + 3;
1117 fnam = alloca(len);
1118 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
1119 if (ret < 0 || (size_t)ret >= len)
1120 return false;
1121
1122 return (faccessat(cfd, fnam, F_OK, 0) == 0);
1123 }
1124
1125 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1126 {
1127 int ret, cfd;
1128 size_t len;
1129 char *fnam;
1130 struct stat sb;
1131 struct cgfs_files *newkey;
1132
1133 cfd = find_mounted_controller(controller);
1134 if (cfd < 0)
1135 return NULL;
1136
1137 if (file && *file == '/')
1138 file++;
1139
1140 if (file && strchr(file, '/'))
1141 return NULL;
1142
1143 /* Make sure we pass a relative path to *at() family of functions.
1144 * . + /cgroup + / + file + \0
1145 */
1146 len = strlen(cgroup) + 3;
1147 if (file)
1148 len += strlen(file) + 1;
1149 fnam = alloca(len);
1150 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1151 file ? "/" : "", file ? file : "");
1152
1153 ret = fstatat(cfd, fnam, &sb, 0);
1154 if (ret < 0)
1155 return NULL;
1156
1157 do {
1158 newkey = malloc(sizeof(struct cgfs_files));
1159 } while (!newkey);
1160 if (file)
1161 newkey->name = must_copy_string(file);
1162 else if (strrchr(cgroup, '/'))
1163 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1164 else
1165 newkey->name = must_copy_string(cgroup);
1166 newkey->uid = sb.st_uid;
1167 newkey->gid = sb.st_gid;
1168 newkey->mode = sb.st_mode;
1169
1170 return newkey;
1171 }
1172
1173 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1174 {
1175 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1176 if (!entry) {
1177 lxcfs_error("Error getting files under %s:%s\n", controller,
1178 cgroup);
1179 }
1180 return entry;
1181 }
1182
1183 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1184 {
1185 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1186 }
1187
1188 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1189 {
1190 int cfd;
1191 size_t len;
1192 char *fnam;
1193 int ret;
1194 struct stat sb;
1195
1196 cfd = find_mounted_controller(controller);
1197 if (cfd < 0)
1198 return false;
1199
1200 /* Make sure we pass a relative path to *at() family of functions.
1201 * . + /cgroup + / + f + \0
1202 */
1203 len = strlen(cgroup) + strlen(f) + 3;
1204 fnam = alloca(len);
1205 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
1206 if (ret < 0 || (size_t)ret >= len)
1207 return false;
1208
1209 ret = fstatat(cfd, fnam, &sb, 0);
1210 if (ret < 0 || !S_ISDIR(sb.st_mode))
1211 return false;
1212
1213 return true;
1214 }
1215
1216 #define SEND_CREDS_OK 0
1217 #define SEND_CREDS_NOTSK 1
1218 #define SEND_CREDS_FAIL 2
1219 static bool recv_creds(int sock, struct ucred *cred, char *v);
1220 static int wait_for_pid(pid_t pid);
1221 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1222 static int send_creds_clone_wrapper(void *arg);
1223
1224 /*
1225 * clone a task which switches to @task's namespace and writes '1'
1226 * over a unix sock so we can read the task's reaper's pid in our
1227 * namespace.
1228 *
1229 * Note: glibc's fork() does not respect pidns, which can lead to failed
1230 * assertions inside glibc (and thus failed forks) if the child's pid in
1231 * the pidns and the parent pid outside are identical. Using clone prevents
1232 * this issue.
1233 */
1234 static void write_task_init_pid_exit(int sock, pid_t target)
1235 {
1236 char fnam[100];
1237 pid_t pid;
1238 int fd, ret;
1239 size_t stack_size = sysconf(_SC_PAGESIZE);
1240 void *stack = alloca(stack_size);
1241
1242 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1243 if (ret < 0 || (size_t)ret >= sizeof(fnam))
1244 _exit(1);
1245
1246 fd = open(fnam, O_RDONLY);
1247 if (fd < 0) {
1248 perror("write_task_init_pid_exit open of ns/pid");
1249 _exit(1);
1250 }
1251 if (setns(fd, 0)) {
1252 perror("write_task_init_pid_exit setns 1");
1253 close(fd);
1254 _exit(1);
1255 }
1256 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1257 if (pid < 0)
1258 _exit(1);
1259 if (pid != 0) {
1260 if (!wait_for_pid(pid))
1261 _exit(1);
1262 _exit(0);
1263 }
1264 }
1265
1266 static int send_creds_clone_wrapper(void *arg) {
1267 struct ucred cred;
1268 char v;
1269 int sock = *(int *)arg;
1270
1271 /* we are the child */
1272 cred.uid = 0;
1273 cred.gid = 0;
1274 cred.pid = 1;
1275 v = '1';
1276 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1277 return 1;
1278 return 0;
1279 }
1280
1281 static pid_t get_init_pid_for_task(pid_t task)
1282 {
1283 int sock[2];
1284 pid_t pid;
1285 pid_t ret = -1;
1286 char v = '0';
1287 struct ucred cred;
1288
1289 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1290 perror("socketpair");
1291 return -1;
1292 }
1293
1294 pid = fork();
1295 if (pid < 0)
1296 goto out;
1297 if (!pid) {
1298 close(sock[1]);
1299 write_task_init_pid_exit(sock[0], task);
1300 _exit(0);
1301 }
1302
1303 if (!recv_creds(sock[1], &cred, &v))
1304 goto out;
1305 ret = cred.pid;
1306
1307 out:
1308 close(sock[0]);
1309 close(sock[1]);
1310 if (pid > 0)
1311 wait_for_pid(pid);
1312 return ret;
1313 }
1314
1315 pid_t lookup_initpid_in_store(pid_t qpid)
1316 {
1317 pid_t answer = 0;
1318 struct stat sb;
1319 struct pidns_init_store *e;
1320 char fnam[100];
1321
1322 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1323 store_lock();
1324 if (stat(fnam, &sb) < 0)
1325 goto out;
1326 e = lookup_verify_initpid(&sb);
1327 if (e) {
1328 answer = e->initpid;
1329 goto out;
1330 }
1331 answer = get_init_pid_for_task(qpid);
1332 if (answer > 0)
1333 save_initpid(&sb, answer);
1334
1335 out:
1336 /* we prune at end in case we are returning
1337 * the value we were about to return */
1338 prune_initpid_store();
1339 store_unlock();
1340 return answer;
1341 }
1342
1343 static int wait_for_pid(pid_t pid)
1344 {
1345 int status, ret;
1346
1347 if (pid <= 0)
1348 return -1;
1349
1350 again:
1351 ret = waitpid(pid, &status, 0);
1352 if (ret == -1) {
1353 if (errno == EINTR)
1354 goto again;
1355 return -1;
1356 }
1357 if (ret != pid)
1358 goto again;
1359 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1360 return -1;
1361 return 0;
1362 }
1363
1364 /*
1365 * append the given formatted string to *src.
1366 * src: a pointer to a char* in which to append the formatted string.
1367 * sz: the number of characters printed so far, minus trailing \0.
1368 * asz: the allocated size so far
1369 * format: string format. See printf for details.
1370 * ...: varargs. See printf for details.
1371 */
1372 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1373 {
1374 char tmp[BUF_RESERVE_SIZE];
1375 va_list args;
1376
1377 va_start (args, format);
1378 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1379 va_end(args);
1380
1381 if (!*src || tmplen + *sz + 1 >= *asz) {
1382 char *new_src; /* renamed from tmp to avoid shadowing the buffer above */
1383 do {
1384 new_src = realloc(*src, *asz + BUF_RESERVE_SIZE);
1385 } while (!new_src);
1386 *src = new_src;
1387 *asz += BUF_RESERVE_SIZE;
1388 }
1389 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1390 *sz += tmplen;
1391 }
1392
1393 /*
1394 * append pid to *src.
1395 * src: a pointer to a char* in which to append the pid.
1396 * sz: the number of characters printed so far, minus trailing \0.
1397 * asz: the allocated size so far
1398 * pid: the pid to append
1399 */
1400 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1401 {
1402 must_strcat(src, sz, asz, "%d\n", (int)pid);
1403 }
1404
1405 /*
1406 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1407 * valid in the caller's namespace, return the id mapped into
1408 * pid's namespace.
1409 * Returns the mapped id, or -1 on error.
1410 */
1411 unsigned int
1412 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1413 {
1414 unsigned int nsuid, // base id for a range in the idfile's namespace
1415 hostuid, // base id for a range in the caller's namespace
1416 count; // number of ids in this range
1417 char line[400];
1418 int ret;
1419
1420 fseek(idfile, 0L, SEEK_SET);
1421 while (fgets(line, 400, idfile)) {
1422 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1423 if (ret != 3)
1424 continue;
1425 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1426 /*
1427 * uids wrapped around - unexpected as this is a procfile,
1428 * so just bail.
1429 */
1430 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1431 nsuid, hostuid, count, line);
1432 return -1;
1433 }
1434 if (hostuid <= in_id && hostuid+count > in_id) {
1435 /*
1436 * now since hostuid <= in_id < hostuid+count, and
1437 * neither hostuid+count nor nsuid+count wraps around,
1438 * we know that nsuid+(in_id-hostuid), which is less
1439 * than nsuid+count, cannot wrap around either
1440 */
1441 return (in_id - hostuid) + nsuid;
1442 }
1443 }
1444
1445 // no answer found
1446 return -1;
1447 }
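/*
 * Worked example: for a uid_map line "0 100000 65536" (nsuid hostuid
 * count), a caller-namespace id of 100042 satisfies
 * hostuid <= in_id < hostuid + count, so convert_id_to_ns() returns
 * (100042 - 100000) + 0 = 42.
 */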
1448
1449 /*
1450 * for is_privileged_over,
1451 * specify whether we require the calling uid to be root in his
1452 * namespace
1453 */
1454 #define NS_ROOT_REQD true
1455 #define NS_ROOT_OPT false
1456
1457 #define PROCLEN 100
1458
1459 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1460 {
1461 char fpath[PROCLEN];
1462 int ret;
1463 bool answer = false;
1464 uid_t nsuid;
1465
1466 if (victim == -1 || uid == -1)
1467 return false;
1468
1469 /*
1470 * If the request is one not requiring root in the namespace,
1471 * then having the same uid suffices. (i.e. uid 1000 has write
1472 * access to files owned by uid 1000).
1473 */
1474 if (!req_ns_root && uid == victim)
1475 return true;
1476
1477 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1478 if (ret < 0 || ret >= PROCLEN)
1479 return false;
1480 FILE *f = fopen(fpath, "r");
1481 if (!f)
1482 return false;
1483
1484 /* if caller's not root in his namespace, reject */
1485 nsuid = convert_id_to_ns(f, uid);
1486 if (nsuid)
1487 goto out;
1488
1489 /*
1490 * If victim is not mapped into caller's ns, reject.
1491 * XXX I'm not sure this check is needed given that fuse
1492 * will be sending requests where the vfs has converted
1493 */
1494 nsuid = convert_id_to_ns(f, victim);
1495 if (nsuid == -1)
1496 goto out;
1497
1498 answer = true;
1499
1500 out:
1501 fclose(f);
1502 return answer;
1503 }
1504
1505 static bool perms_include(int fmode, mode_t req_mode)
1506 {
1507 mode_t r;
1508
1509 switch (req_mode & O_ACCMODE) {
1510 case O_RDONLY:
1511 r = S_IROTH;
1512 break;
1513 case O_WRONLY:
1514 r = S_IWOTH;
1515 break;
1516 case O_RDWR:
1517 r = S_IROTH | S_IWOTH;
1518 break;
1519 default:
1520 return false;
1521 }
1522 return ((fmode & r) == r);
1523 }
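/*
 * Example: for a file with mode 0644, perms_include(0644, O_RDONLY) is
 * true (S_IROTH is set) while perms_include(0644, O_WRONLY) is false.
 * fc_may_access() below shifts the mode right by 6 or 3 first to test the
 * owner or group permission bits with the same helper.
 */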
1524
1525
1526 /*
1527 * taskcg is /a/b/c/d/e
1528 * querycg is /a/b/c
1529 * we return 'd'
1530 */
1531 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1532 {
1533 char *start, *end;
1534
1535 if (strlen(taskcg) <= strlen(querycg)) {
1536 lxcfs_error("%s\n", "I was fed bad input.");
1537 return NULL;
1538 }
1539
1540 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1541 start = strdup(taskcg + 1);
1542 else
1543 start = strdup(taskcg + strlen(querycg) + 1);
1544 if (!start)
1545 return NULL;
1546 end = strchr(start, '/');
1547 if (end)
1548 *end = '\0';
1549 return start;
1550 }
1551
1552 char *get_pid_cgroup(pid_t pid, const char *contrl)
1553 {
1554 int cfd;
1555
1556 cfd = find_mounted_controller(contrl);
1557 if (cfd < 0)
1558 return NULL;
1559
1560 if (pure_unified_layout(cgroup_ops))
1561 return cg_unified_get_current_cgroup(pid);
1562
1563 return cg_legacy_get_current_cgroup(pid, contrl);
1564 }
1565
1566 /*
1567 * check whether a fuse context may access a cgroup dir or file
1568 *
1569 * If file is not null, it is a cgroup file to check under cg.
1570 * If file is null, then we are checking perms on cg itself.
1571 *
1572 * For files we can check the mode of the list_keys result.
1573 * For cgroups, we must make assumptions based on the files under the
1574 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1575 * yet.
1576 */
1577 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1578 {
1579 struct cgfs_files *k = NULL;
1580 bool ret = false;
1581
1582 k = cgfs_get_key(contrl, cg, file);
1583 if (!k)
1584 return false;
1585
1586 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1587 if (perms_include(k->mode >> 6, mode)) {
1588 ret = true;
1589 goto out;
1590 }
1591 }
1592 if (fc->gid == k->gid) {
1593 if (perms_include(k->mode >> 3, mode)) {
1594 ret = true;
1595 goto out;
1596 }
1597 }
1598 ret = perms_include(k->mode, mode);
1599
1600 out:
1601 free_key(k);
1602 return ret;
1603 }
1604
1605 #define INITSCOPE "/init.scope"
1606 void prune_init_slice(char *cg)
1607 {
1608 char *point;
1609 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1610
1611 if (cg_len < initscope_len)
1612 return;
1613
1614 point = cg + cg_len - initscope_len;
1615 if (strcmp(point, INITSCOPE) == 0) {
1616 if (point == cg)
1617 *(point+1) = '\0';
1618 else
1619 *point = '\0';
1620 }
1621 }
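/*
 * Example: prune_init_slice() turns "/user.slice/init.scope" into
 * "/user.slice", and a bare "/init.scope" into "/", so the ancestry
 * comparisons below are not confused by systemd's init scope.
 */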
1622
1623 /*
1624 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1625 * If pid is in /a, he may act on /a/b, but not on /b.
1626 * If the answer is false and nextcg is not NULL, then *nextcg will point
1627 * to a string containing the next cgroup directory under cg, which must be
1628 * freed by the caller.
1629 */
1630 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1631 {
1632 bool answer = false;
1633 char *c2 = get_pid_cgroup(pid, contrl);
1634 char *linecmp;
1635
1636 if (!c2)
1637 return false;
1638 prune_init_slice(c2);
1639
1640 /*
1641 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1642 * they pass in a cgroup without leading '/'
1643 *
1644 * The original line here was:
1645 * linecmp = *cg == '/' ? c2 : c2+1;
1646 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1647 * Serge, do you know?
1648 */
1649 if (*cg == '/' || !strncmp(cg, "./", 2))
1650 linecmp = c2;
1651 else
1652 linecmp = c2 + 1;
1653 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1654 if (nextcg) {
1655 *nextcg = get_next_cgroup_dir(linecmp, cg);
1656 }
1657 goto out;
1658 }
1659 answer = true;
1660
1661 out:
1662 free(c2);
1663 return answer;
1664 }
1665
1666 /*
1667 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1668 */
1669 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1670 {
1671 bool answer = false;
1672 char *c2, *task_cg;
1673 size_t target_len, task_len;
1674
1675 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1676 return true;
1677
1678 c2 = get_pid_cgroup(pid, contrl);
1679 if (!c2)
1680 return false;
1681 prune_init_slice(c2);
1682
1683 task_cg = c2 + 1;
1684 target_len = strlen(cg);
1685 task_len = strlen(task_cg);
1686 if (task_len == 0) {
1687 /* Task is in the root cg, it can see everything. This case is
1688 * not handled by the strcmps below, since they test for the
1689 * last /, but that is the first / that we've chopped off
1690 * above.
1691 */
1692 answer = true;
1693 goto out;
1694 }
1695 if (strcmp(cg, task_cg) == 0) {
1696 answer = true;
1697 goto out;
1698 }
1699 if (target_len < task_len) {
1700 /* looking up a parent dir */
1701 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1702 answer = true;
1703 goto out;
1704 }
1705 if (target_len > task_len) {
1706 /* looking up a child dir */
1707 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1708 answer = true;
1709 goto out;
1710 }
1711
1712 out:
1713 free(c2);
1714 return answer;
1715 }
1716
1717 /*
1718 * given /cgroup/freezer/a/b, return "freezer".
1719 * the returned char* should NOT be freed.
1720 */
1721 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1722 {
1723 const char *p1;
1724 char *contr, *slash;
1725
1726 if (strlen(path) < 9) {
1727 errno = EACCES;
1728 return NULL;
1729 }
1730 if (*(path + 7) != '/') {
1731 errno = EINVAL;
1732 return NULL;
1733 }
1734 p1 = path + 8;
1735 contr = strdupa(p1);
1736 if (!contr) {
1737 errno = ENOMEM;
1738 return NULL;
1739 }
1740 slash = strstr(contr, "/");
1741 if (slash)
1742 *slash = '\0';
1743
1744 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1745 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1746 return (*h)->__controllers;
1747 }
1748 errno = ENOENT;
1749 return NULL;
1750 }
1751
1752 /*
1753 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1754 * Note that the returned value may include files (keynames) etc
1755 */
1756 static const char *find_cgroup_in_path(const char *path)
1757 {
1758 const char *p1;
1759
1760 if (strlen(path) < 9) {
1761 errno = EACCES;
1762 return NULL;
1763 }
1764 p1 = strstr(path + 8, "/");
1765 if (!p1) {
1766 errno = EINVAL;
1767 return NULL;
1768 }
1769 errno = 0;
1770 return p1 + 1;
1771 }
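/*
 * Example: for the path "/cgroup/freezer/a/b", pick_controller_from_path()
 * returns "freezer" and find_cgroup_in_path() returns "a/b" (which may
 * still end in a keyname such as freezer.state).
 */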
1772
1773 /*
1774 * split the last path element from the path in @cg.
1775 * @dir is newly allocated and should be freed, @last not
1776 */
1777 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1778 {
1779 char *p;
1780
1781 do {
1782 *dir = strdup(cg);
1783 } while (!*dir);
1784 *last = strrchr(cg, '/');
1785 if (!*last) {
1786 *last = NULL;
1787 return;
1788 }
1789 p = strrchr(*dir, '/');
1790 *p = '\0';
1791 }
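/*
 * Example: get_cgdir_and_path("a/b/c", &dir, &last) sets dir to a freshly
 * allocated "a/b" and last to point at the trailing "/c" inside the
 * caller's string (consumers such as cgfs_get_key() skip the leading '/').
 * For a path with no '/', dir is a copy of the whole string and last is
 * NULL.
 */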
1792
1793 /*
1794 * FUSE ops for /cgroup
1795 */
1796
1797 int cg_getattr(const char *path, struct stat *sb)
1798 {
1799 struct timespec now;
1800 struct fuse_context *fc = fuse_get_context();
1801 char * cgdir = NULL;
1802 char *last = NULL, *path1, *path2;
1803 struct cgfs_files *k = NULL;
1804 const char *cgroup;
1805 const char *controller = NULL;
1806 int ret = -ENOENT;
1807
1808
1809 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1810 return -EIO;
1811
1812 memset(sb, 0, sizeof(struct stat));
1813
1814 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1815 return -EINVAL;
1816
1817 sb->st_uid = sb->st_gid = 0;
1818 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1819 sb->st_size = 0;
1820
1821 if (strcmp(path, "/cgroup") == 0) {
1822 sb->st_mode = S_IFDIR | 00755;
1823 sb->st_nlink = 2;
1824 return 0;
1825 }
1826
1827 controller = pick_controller_from_path(fc, path);
1828 if (!controller)
1829 return -errno;
1830 cgroup = find_cgroup_in_path(path);
1831 if (!cgroup) {
1832 /* this is just /cgroup/controller, return it as a dir */
1833 sb->st_mode = S_IFDIR | 00755;
1834 sb->st_nlink = 2;
1835 return 0;
1836 }
1837
1838 get_cgdir_and_path(cgroup, &cgdir, &last);
1839
1840 if (!last) {
1841 path1 = "/";
1842 path2 = cgdir;
1843 } else {
1844 path1 = cgdir;
1845 path2 = last;
1846 }
1847
1848 pid_t initpid = lookup_initpid_in_store(fc->pid);
1849 if (initpid <= 1 || is_shared_pidns(initpid))
1850 initpid = fc->pid;
1851 /* check that path2 is either a child cgroup of cgdir, or listed in its keys.
1852 * Then check that caller's cgroup is under path if last is a child
1853 * cgroup, or cgdir if last is a file */
1854
1855 if (is_child_cgroup(controller, path1, path2)) {
1856 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1857 ret = -ENOENT;
1858 goto out;
1859 }
1860 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1861 /* this is just /cgroup/controller, return it as a dir */
1862 sb->st_mode = S_IFDIR | 00555;
1863 sb->st_nlink = 2;
1864 ret = 0;
1865 goto out;
1866 }
1867 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1868 ret = -EACCES;
1869 goto out;
1870 }
1871
1872 // get uid, gid, from '/tasks' file and make up a mode
1873 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1874 sb->st_mode = S_IFDIR | 00755;
1875 k = cgfs_get_key(controller, cgroup, NULL);
1876 if (!k) {
1877 sb->st_uid = sb->st_gid = 0;
1878 } else {
1879 sb->st_uid = k->uid;
1880 sb->st_gid = k->gid;
1881 }
1882 free_key(k);
1883 sb->st_nlink = 2;
1884 ret = 0;
1885 goto out;
1886 }
1887
1888 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1889 sb->st_mode = S_IFREG | k->mode;
1890 sb->st_nlink = 1;
1891 sb->st_uid = k->uid;
1892 sb->st_gid = k->gid;
1893 sb->st_size = 0;
1894 free_key(k);
1895 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1896 ret = -ENOENT;
1897 goto out;
1898 }
1899 ret = 0;
1900 }
1901
1902 out:
1903 free(cgdir);
1904 return ret;
1905 }
1906
1907 int cg_opendir(const char *path, struct fuse_file_info *fi)
1908 {
1909 struct fuse_context *fc = fuse_get_context();
1910 const char *cgroup;
1911 struct file_info *dir_info;
1912 char *controller = NULL;
1913
1914 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1915 return -EIO;
1916
1917 if (strcmp(path, "/cgroup") == 0) {
1918 cgroup = NULL;
1919 controller = NULL;
1920 } else {
1921 // return list of keys for the controller, and list of child cgroups
1922 controller = pick_controller_from_path(fc, path);
1923 if (!controller)
1924 return -errno;
1925
1926 cgroup = find_cgroup_in_path(path);
1927 if (!cgroup) {
1928 /* this is just /cgroup/controller, return its contents */
1929 cgroup = "/";
1930 }
1931 }
1932
1933 pid_t initpid = lookup_initpid_in_store(fc->pid);
1934 if (initpid <= 1 || is_shared_pidns(initpid))
1935 initpid = fc->pid;
1936 if (cgroup) {
1937 if (!caller_may_see_dir(initpid, controller, cgroup))
1938 return -ENOENT;
1939 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1940 return -EACCES;
1941 }
1942
1943 /* we'll free this at cg_releasedir */
1944 dir_info = malloc(sizeof(*dir_info));
1945 if (!dir_info)
1946 return -ENOMEM;
1947 dir_info->controller = must_copy_string(controller);
1948 dir_info->cgroup = must_copy_string(cgroup);
1949 dir_info->type = LXC_TYPE_CGDIR;
1950 dir_info->buf = NULL;
1951 dir_info->file = NULL;
1952 dir_info->buflen = 0;
1953
1954 fi->fh = (unsigned long)dir_info;
1955 return 0;
1956 }
1957
1958 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1959 struct fuse_file_info *fi)
1960 {
1961 struct file_info *d = (struct file_info *)fi->fh;
1962 struct cgfs_files **list = NULL;
1963 int i, ret;
1964 char *nextcg = NULL;
1965 struct fuse_context *fc = fuse_get_context();
1966 char **clist = NULL;
1967
1968 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1969 return -EIO;
1970
1971 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1972 return -EIO;
1973
1974 if (d->type != LXC_TYPE_CGDIR) {
1975 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1976 return -EIO;
1977 }
1978 if (!d->cgroup && !d->controller) {
1979 /*
1980 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1981 * This only works with the legacy hierarchy.
1982 */
1983 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1984 if (is_unified_hierarchy(*h))
1985 continue;
1986
1987 if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
1988 return -EIO;
1989 }
1990
1991 return 0;
1992 }
1993
1994 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1995 // not a valid cgroup
1996 ret = -EINVAL;
1997 goto out;
1998 }
1999
2000 pid_t initpid = lookup_initpid_in_store(fc->pid);
2001 if (initpid <= 1 || is_shared_pidns(initpid))
2002 initpid = fc->pid;
2003 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2004 if (nextcg) {
2005 ret = filler(buf, nextcg, NULL, 0);
2006 free(nextcg);
2007 if (ret != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012 ret = 0;
2013 goto out;
2014 }
2015
2016 for (i = 0; list && list[i]; i++) {
2017 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2018 ret = -EIO;
2019 goto out;
2020 }
2021 }
2022
2023 // now get the list of child cgroups
2024
2025 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2026 ret = 0;
2027 goto out;
2028 }
2029 if (clist) {
2030 for (i = 0; clist[i]; i++) {
2031 if (filler(buf, clist[i], NULL, 0) != 0) {
2032 ret = -EIO;
2033 goto out;
2034 }
2035 }
2036 }
2037 ret = 0;
2038
2039 out:
2040 free_keys(list);
2041 if (clist) {
2042 for (i = 0; clist[i]; i++)
2043 free(clist[i]);
2044 free(clist);
2045 }
2046 return ret;
2047 }
2048
2049 void do_release_file_info(struct fuse_file_info *fi)
2050 {
2051 struct file_info *f = (struct file_info *)fi->fh;
2052
2053 if (!f)
2054 return;
2055
2056 fi->fh = 0;
2057
2058 free_disarm(f->controller);
2059 free_disarm(f->cgroup);
2060 free_disarm(f->file);
2061 free_disarm(f->buf);
2062 free_disarm(f);
2063 }
2064
2065 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2066 {
2067 do_release_file_info(fi);
2068 return 0;
2069 }
2070
2071 int cg_open(const char *path, struct fuse_file_info *fi)
2072 {
2073 const char *cgroup;
2074 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2075 struct cgfs_files *k = NULL;
2076 struct file_info *file_info;
2077 struct fuse_context *fc = fuse_get_context();
2078 int ret;
2079
2080 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2081 return -EIO;
2082
2083 controller = pick_controller_from_path(fc, path);
2084 if (!controller)
2085 return -errno;
2086 cgroup = find_cgroup_in_path(path);
2087 if (!cgroup)
2088 return -errno;
2089
2090 get_cgdir_and_path(cgroup, &cgdir, &last);
2091 if (!last) {
2092 path1 = "/";
2093 path2 = cgdir;
2094 } else {
2095 path1 = cgdir;
2096 path2 = last;
2097 }
2098
2099 k = cgfs_get_key(controller, path1, path2);
2100 if (!k) {
2101 ret = -EINVAL;
2102 goto out;
2103 }
2104 free_key(k);
2105
2106 pid_t initpid = lookup_initpid_in_store(fc->pid);
2107 if (initpid <= 1 || is_shared_pidns(initpid))
2108 initpid = fc->pid;
2109 if (!caller_may_see_dir(initpid, controller, path1)) {
2110 ret = -ENOENT;
2111 goto out;
2112 }
2113 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2114 ret = -EACCES;
2115 goto out;
2116 }
2117
2118 /* we'll free this at cg_release */
2119 file_info = malloc(sizeof(*file_info));
2120 if (!file_info) {
2121 ret = -ENOMEM;
2122 goto out;
2123 }
2124 file_info->controller = must_copy_string(controller);
2125 file_info->cgroup = must_copy_string(path1);
2126 file_info->file = must_copy_string(path2);
2127 file_info->type = LXC_TYPE_CGFILE;
2128 file_info->buf = NULL;
2129 file_info->buflen = 0;
2130
2131 fi->fh = (unsigned long)file_info;
2132 ret = 0;
2133
2134 out:
2135 free(cgdir);
2136 return ret;
2137 }
2138
2139 int cg_access(const char *path, int mode)
2140 {
2141 int ret;
2142 const char *cgroup;
2143 char *path1, *path2, *controller;
2144 char *last = NULL, *cgdir = NULL;
2145 struct cgfs_files *k = NULL;
2146 struct fuse_context *fc = fuse_get_context();
2147
2148 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2149 return -EIO;
2150
2151 if (strcmp(path, "/cgroup") == 0)
2152 return 0;
2153
2154 controller = pick_controller_from_path(fc, path);
2155 if (!controller)
2156 return -errno;
2157 cgroup = find_cgroup_in_path(path);
2158 if (!cgroup) {
2159 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2160 if ((mode & W_OK) == 0)
2161 return 0;
2162 return -EACCES;
2163 }
2164
2165 get_cgdir_and_path(cgroup, &cgdir, &last);
2166 if (!last) {
2167 path1 = "/";
2168 path2 = cgdir;
2169 } else {
2170 path1 = cgdir;
2171 path2 = last;
2172 }
2173
2174 k = cgfs_get_key(controller, path1, path2);
2175 if (!k) {
2176 if ((mode & W_OK) == 0)
2177 ret = 0;
2178 else
2179 ret = -EACCES;
2180 goto out;
2181 }
2182 free_key(k);
2183
2184 pid_t initpid = lookup_initpid_in_store(fc->pid);
2185 if (initpid <= 1 || is_shared_pidns(initpid))
2186 initpid = fc->pid;
2187 if (!caller_may_see_dir(initpid, controller, path1)) {
2188 ret = -ENOENT;
2189 goto out;
2190 }
2191 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2192 ret = -EACCES;
2193 goto out;
2194 }
2195
2196 ret = 0;
2197
2198 out:
2199 free(cgdir);
2200 return ret;
2201 }
2202
2203 int cg_release(const char *path, struct fuse_file_info *fi)
2204 {
2205 do_release_file_info(fi);
2206 return 0;
2207 }
2208
2209 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2210
2211 static bool wait_for_sock(int sock, int timeout)
2212 {
2213 struct epoll_event ev;
2214 int epfd, ret, now, starttime, deltatime, saved_errno;
2215
2216 if ((starttime = time(NULL)) < 0)
2217 return false;
2218
2219 if ((epfd = epoll_create(1)) < 0) {
2220 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2221 return false;
2222 }
2223
2224 ev.events = POLLIN_SET;
2225 ev.data.fd = sock;
2226 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2227 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2228 close(epfd);
2229 return false;
2230 }
2231
2232 again:
2233 if ((now = time(NULL)) < 0) {
2234 close(epfd);
2235 return false;
2236 }
2237
2238 deltatime = (starttime + timeout) - now;
2239 if (deltatime < 0) { // timeout
2240 errno = 0;
2241 close(epfd);
2242 return false;
2243 }
2244 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2245 if (ret < 0 && errno == EINTR)
2246 goto again;
2247 saved_errno = errno;
2248 close(epfd);
2249
2250 if (ret <= 0) {
2251 errno = saved_errno;
2252 return false;
2253 }
2254 return true;
2255 }
2256
2257 static int msgrecv(int sockfd, void *buf, size_t len)
2258 {
2259 if (!wait_for_sock(sockfd, 2))
2260 return -1;
2261 return recv(sockfd, buf, len, MSG_DONTWAIT);
2262 }
2263
2264 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2265 {
2266 struct msghdr msg = { 0 };
2267 struct iovec iov;
2268 struct cmsghdr *cmsg;
2269 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2270 char buf[1];
2271 buf[0] = 'p';
2272
2273 if (pingfirst) {
2274 if (msgrecv(sock, buf, 1) != 1) {
2275 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2276 return SEND_CREDS_FAIL;
2277 }
2278 }
2279
2280 msg.msg_control = cmsgbuf;
2281 msg.msg_controllen = sizeof(cmsgbuf);
2282
2283 cmsg = CMSG_FIRSTHDR(&msg);
2284 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2285 cmsg->cmsg_level = SOL_SOCKET;
2286 cmsg->cmsg_type = SCM_CREDENTIALS;
2287 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2288
2289 msg.msg_name = NULL;
2290 msg.msg_namelen = 0;
2291
2292 buf[0] = v;
2293 iov.iov_base = buf;
2294 iov.iov_len = sizeof(buf);
2295 msg.msg_iov = &iov;
2296 msg.msg_iovlen = 1;
2297
2298 if (sendmsg(sock, &msg, 0) < 0) {
2299 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2300 		if (errno == ESRCH) /* no such process */
2301 return SEND_CREDS_NOTSK;
2302 return SEND_CREDS_FAIL;
2303 }
2304
2305 return SEND_CREDS_OK;
2306 }
2307
2308 static bool recv_creds(int sock, struct ucred *cred, char *v)
2309 {
2310 struct msghdr msg = { 0 };
2311 struct iovec iov;
2312 struct cmsghdr *cmsg;
2313 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2314 char buf[1];
2315 int ret;
2316 int optval = 1;
2317
2318 *v = '1';
2319
2320 cred->pid = -1;
2321 cred->uid = -1;
2322 cred->gid = -1;
2323
2324 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2325 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2326 return false;
2327 }
2328 buf[0] = '1';
2329 if (write(sock, buf, 1) != 1) {
2330 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2331 return false;
2332 }
2333
2334 msg.msg_name = NULL;
2335 msg.msg_namelen = 0;
2336 msg.msg_control = cmsgbuf;
2337 msg.msg_controllen = sizeof(cmsgbuf);
2338
2339 iov.iov_base = buf;
2340 iov.iov_len = sizeof(buf);
2341 msg.msg_iov = &iov;
2342 msg.msg_iovlen = 1;
2343
2344 if (!wait_for_sock(sock, 2)) {
2345 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2346 return false;
2347 }
2348 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2349 if (ret < 0) {
2350 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2351 return false;
2352 }
2353
2354 cmsg = CMSG_FIRSTHDR(&msg);
2355
2356 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2357 cmsg->cmsg_level == SOL_SOCKET &&
2358 cmsg->cmsg_type == SCM_CREDENTIALS) {
2359 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2360 }
2361 *v = buf[0];
2362
2363 return true;
2364 }
2365
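/*
 * Illustrative sketch of how send_creds() and recv_creds() pair up across
 * a socketpair (kept under #if 0; a usage sketch, not part of the build).
 * The kernel rewrites the pid in the ucred into the receiver's pid
 * namespace, which is the whole point of the exchange. The ping byte
 * exists because SO_PASSCRED must be enabled on the receiving end before
 * the sender transmits, so the receiver writes one byte first and the
 * sender waits for it (pingfirst = true).
 */
#if 0
static void creds_example(void)
{
	int sv[2];
	struct ucred cred = { .pid = getpid(), .uid = getuid(), .gid = getgid() };

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return;

	if (fork() == 0) { /* child: receives and sees the translated pid */
		struct ucred peer;
		char got;

		if (recv_creds(sv[1], &peer, &got))
			printf("sender pid as seen in this pidns: %d\n", peer.pid);
		_exit(0);
	}

	/* parent: wait for the child's ping, then send our credentials */
	send_creds(sv[0], &cred, '0', true);
	wait(NULL);
}
#endif
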
2366 struct pid_ns_clone_args {
2367 int *cpipe;
2368 int sock;
2369 pid_t tpid;
2370 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2371 };
2372
2373 /*
2374 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2375 * with clone(). This simply writes '1' as ACK back to the parent
2376 * before calling the actual wrapped function.
2377 */
2378 static int pid_ns_clone_wrapper(void *arg) {
2379 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2380 char b = '1';
2381
2382 close(args->cpipe[0]);
2383 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2384 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2385 close(args->cpipe[1]);
2386 return args->wrapped(args->sock, args->tpid);
2387 }
2388
2389 /*
2390 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2391 * int value back over the socket. This shifts the pid from the
2392 * sender's pidns into tpid's pidns.
2393 */
2394 static int pid_to_ns(int sock, pid_t tpid)
2395 {
2396 char v = '0';
2397 struct ucred cred;
2398
2399 while (recv_creds(sock, &cred, &v)) {
2400 if (v == '1')
2401 return 0;
2402 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2403 return 1;
2404 }
2405 return 0;
2406 }
2407
2409 /*
2410 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2411 * in your old pidns. Only children which you clone will be in the target
2412 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2413 * actually convert pids.
2414 *
2415 * Note: glibc's fork() does not respect pidns, which can lead to failed
2416 * assertions inside glibc (and thus failed forks) if the child's pid in
2417 * the pidns and the parent pid outside are identical. Using clone prevents
2418 * this issue.
2419 */
2420 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2421 {
2422 int newnsfd = -1, ret, cpipe[2];
2423 char fnam[100];
2424 pid_t cpid;
2425 char v;
2426
2427 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2428 if (ret < 0 || ret >= sizeof(fnam))
2429 _exit(1);
2430 newnsfd = open(fnam, O_RDONLY);
2431 if (newnsfd < 0)
2432 _exit(1);
2433 if (setns(newnsfd, 0) < 0)
2434 _exit(1);
2435 close(newnsfd);
2436
2437 if (pipe(cpipe) < 0)
2438 _exit(1);
2439
2440 struct pid_ns_clone_args args = {
2441 .cpipe = cpipe,
2442 .sock = sock,
2443 .tpid = tpid,
2444 .wrapped = &pid_to_ns
2445 };
2446 size_t stack_size = sysconf(_SC_PAGESIZE);
2447 void *stack = alloca(stack_size);
2448
2449 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2450 if (cpid < 0)
2451 _exit(1);
2452
2453 // give the child 1 second to be done forking and
2454 // write its ack
2455 if (!wait_for_sock(cpipe[0], 1))
2456 _exit(1);
2457 ret = read(cpipe[0], &v, 1);
2458 if (ret != sizeof(char) || v != '1')
2459 _exit(1);
2460
2461 if (!wait_for_pid(cpid))
2462 _exit(1);
2463 _exit(0);
2464 }
2465
2466 /*
2467  * To read pid-containing cgroup files for a particular reader, we read the
2468  * data here, then fork a child which setns()s into the reader's pidns and
2469  * clones a helper truly inside that ns to translate each pid over a socketpair.
2470 */
2471 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2472 {
2473 int sock[2] = {-1, -1};
2474 char *tmpdata = NULL;
2475 int ret;
2476 pid_t qpid, cpid = -1;
2477 bool answer = false;
2478 char v = '0';
2479 struct ucred cred;
2480 size_t sz = 0, asz = 0;
2481
2482 if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
2483 return false;
2484
2485 /*
2486 * Now we read the pids from returned data one by one, pass
2487 * them into a child in the target namespace, read back the
2488 * translated pids, and put them into our to-return data
2489 */
2490
2491 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2492 perror("socketpair");
2493 free(tmpdata);
2494 return false;
2495 }
2496
2497 cpid = fork();
2498 if (cpid == -1)
2499 goto out;
2500
2501 if (!cpid) // child - exits when done
2502 pid_to_ns_wrapper(sock[1], tpid);
2503
2504 char *ptr = tmpdata;
2505 cred.uid = 0;
2506 cred.gid = 0;
2507 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2508 cred.pid = qpid;
2509 ret = send_creds(sock[0], &cred, v, true);
2510
2511 if (ret == SEND_CREDS_NOTSK)
2512 goto next;
2513 if (ret == SEND_CREDS_FAIL)
2514 goto out;
2515
2516 // read converted results
2517 if (!wait_for_sock(sock[0], 2)) {
2518 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2519 goto out;
2520 }
2521 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2522 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2523 goto out;
2524 }
2525 must_strcat_pid(d, &sz, &asz, qpid);
2526 next:
2527 ptr = strchr(ptr, '\n');
2528 if (!ptr)
2529 break;
2530 ptr++;
2531 }
2532
2533 cred.pid = getpid();
2534 v = '1';
2535 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2536 // failed to ask child to exit
2537 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2538 goto out;
2539 }
2540
2541 answer = true;
2542
2543 out:
2544 free(tmpdata);
2545 if (cpid != -1)
2546 wait_for_pid(cpid);
2547 if (sock[0] != -1) {
2548 close(sock[0]);
2549 close(sock[1]);
2550 }
2551 return answer;
2552 }
2553
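/*
 * Illustrative usage sketch for do_read_pids() (under #if 0, not part of
 * the build; the controller and cgroup names are hypothetical). On
 * success the translated, newline-separated pids are appended to *d,
 * which the caller must free.
 */
#if 0
static void read_pids_example(pid_t container_init)
{
	char *data = NULL;

	if (do_read_pids(container_init, "freezer", "/lxc/c1", "tasks", &data))
		printf("%s", data ? data : "");
	free(data);
}
#endif
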
2554 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2555 struct fuse_file_info *fi)
2556 {
2557 struct fuse_context *fc = fuse_get_context();
2558 struct file_info *f = (struct file_info *)fi->fh;
2559 struct cgfs_files *k = NULL;
2560 char *data = NULL;
2561 int ret, s;
2562 bool r;
2563
2564 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2565 return -EIO;
2566
2567 if (f->type != LXC_TYPE_CGFILE) {
2568 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2569 return -EIO;
2570 }
2571
2572 if (offset)
2573 return 0;
2574
2575 if (!f->controller)
2576 return -EINVAL;
2577
2578 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2579 return -EINVAL;
2580 }
2581 free_key(k);
2582
2584 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2585 ret = -EACCES;
2586 goto out;
2587 }
2588
2589 if (strcmp(f->file, "tasks") == 0 ||
2590 strcmp(f->file, "/tasks") == 0 ||
2591 strcmp(f->file, "/cgroup.procs") == 0 ||
2592 strcmp(f->file, "cgroup.procs") == 0)
2593 // special case - we have to translate the pids
2594 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2595 else
2596 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
2597
2598 if (!r) {
2599 ret = -EINVAL;
2600 goto out;
2601 }
2602
2603 if (!data) {
2604 ret = 0;
2605 goto out;
2606 }
2607 s = strlen(data);
2608 if (s > size)
2609 s = size;
2610 memcpy(buf, data, s);
2611 if (s > 0 && s < size && data[s-1] != '\n')
2612 buf[s++] = '\n';
2613
2614 ret = s;
2615
2616 out:
2617 free(data);
2618 return ret;
2619 }
2620
2621 static int pid_from_ns(int sock, pid_t tpid)
2622 {
2623 pid_t vpid;
2624 struct ucred cred;
2625 char v;
2626 int ret;
2627
2628 cred.uid = 0;
2629 cred.gid = 0;
2630 while (1) {
2631 if (!wait_for_sock(sock, 2)) {
2632 lxcfs_error("%s\n", "Timeout reading from parent.");
2633 return 1;
2634 }
2635 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2636 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2637 return 1;
2638 }
2639 if (vpid == -1) // done
2640 break;
2641 v = '0';
2642 cred.pid = vpid;
2643 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2644 v = '1';
2645 cred.pid = getpid();
2646 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2647 return 1;
2648 }
2649 }
2650 return 0;
2651 }
2652
2653 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2654 {
2655 int newnsfd = -1, ret, cpipe[2];
2656 char fnam[100];
2657 pid_t cpid;
2658 char v;
2659
2660 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2661 if (ret < 0 || ret >= sizeof(fnam))
2662 _exit(1);
2663 newnsfd = open(fnam, O_RDONLY);
2664 if (newnsfd < 0)
2665 _exit(1);
2666 if (setns(newnsfd, 0) < 0)
2667 _exit(1);
2668 close(newnsfd);
2669
2670 if (pipe(cpipe) < 0)
2671 _exit(1);
2672
2673 struct pid_ns_clone_args args = {
2674 .cpipe = cpipe,
2675 .sock = sock,
2676 .tpid = tpid,
2677 .wrapped = &pid_from_ns
2678 };
2679 size_t stack_size = sysconf(_SC_PAGESIZE);
2680 void *stack = alloca(stack_size);
2681
2682 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2683 if (cpid < 0)
2684 _exit(1);
2685
2686 // give the child 1 second to be done forking and
2687 // write its ack
2688 if (!wait_for_sock(cpipe[0], 1))
2689 _exit(1);
2690 ret = read(cpipe[0], &v, 1);
2691 if (ret != sizeof(char) || v != '1')
2692 _exit(1);
2693
2694 if (!wait_for_pid(cpid))
2695 _exit(1);
2696 _exit(0);
2697 }
2698
2699 /*
2700  * Given host @uid, look up the uid to which it maps in @pid's user
2701  * namespace and store it in @answer. Returns false if there is no mapping.
2702 */
2703 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2704 {
2705 FILE *f;
2706 char line[400];
2707
2708 sprintf(line, "/proc/%d/uid_map", pid);
2709 if ((f = fopen(line, "r")) == NULL) {
2710 return false;
2711 }
2712
2713 *answer = convert_id_to_ns(f, uid);
2714 fclose(f);
2715
2716 if (*answer == -1)
2717 return false;
2718 return true;
2719 }
2720
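/*
 * Worked example for the lookup above: given a /proc/<pid>/uid_map of
 *
 *     0 100000 65536
 *
 * host uid 100042 falls inside [100000, 165536) and maps to ns uid 42,
 * while host uid 1000 is outside every range and yields no mapping. A
 * sketch of that per-range arithmetic (hypothetical helper, not built;
 * convert_id_to_ns() does the actual parsing):
 */
#if 0
static uid_t map_one_range(uid_t hostuid, uid_t ns_start, uid_t host_start,
			   uid_t count)
{
	if (hostuid >= host_start && hostuid - host_start < count)
		return ns_start + (hostuid - host_start);
	return -1; /* no mapping in this range */
}
#endif
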
2721 /*
2722 * get_pid_creds: get the real uid and gid of @pid from
2723 * /proc/$$/status
2724 * (XXX should we use euid here?)
2725 */
2726 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2727 {
2728 char line[400];
2729 uid_t u;
2730 gid_t g;
2731 FILE *f;
2732
2733 *uid = -1;
2734 *gid = -1;
2735 sprintf(line, "/proc/%d/status", pid);
2736 if ((f = fopen(line, "r")) == NULL) {
2737 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2738 return;
2739 }
2740 while (fgets(line, 400, f)) {
2741 if (strncmp(line, "Uid:", 4) == 0) {
2742 if (sscanf(line+4, "%u", &u) != 1) {
2743 lxcfs_error("bad uid line for pid %u\n", pid);
2744 fclose(f);
2745 return;
2746 }
2747 *uid = u;
2748 } else if (strncmp(line, "Gid:", 4) == 0) {
2749 if (sscanf(line+4, "%u", &g) != 1) {
2750 lxcfs_error("bad gid line for pid %u\n", pid);
2751 fclose(f);
2752 return;
2753 }
2754 *gid = g;
2755 }
2756 }
2757 fclose(f);
2758 }
2759
2760 /*
2761 * May the requestor @r move victim @v to a new cgroup?
2762 * This is allowed if
2763 * . they are the same task
2764  * . they are owned by the same uid
2765 * . @r is root on the host, or
2766 * . @v's uid is mapped into @r's where @r is root.
2767 */
2768 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2769 {
2770 uid_t v_uid, tmpuid;
2771 gid_t v_gid;
2772
2773 if (r == v)
2774 return true;
2775 if (r_uid == 0)
2776 return true;
2777 get_pid_creds(v, &v_uid, &v_gid);
2778 if (r_uid == v_uid)
2779 return true;
2780 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2781 && hostuid_to_ns(v_uid, r, &tmpuid))
2782 return true;
2783 return false;
2784 }
2785
2786 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2787 const char *file, const char *buf)
2788 {
2789 int sock[2] = {-1, -1};
2790 pid_t qpid, cpid = -1;
2791 FILE *pids_file = NULL;
2792 bool answer = false, fail = false;
2793
2794 pids_file = open_pids_file(contrl, cg);
2795 if (!pids_file)
2796 return false;
2797
2798 /*
2799 * write the pids to a socket, have helper in writer's pidns
2800 * call movepid for us
2801 */
2802 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2803 perror("socketpair");
2804 goto out;
2805 }
2806
2807 cpid = fork();
2808 if (cpid == -1)
2809 goto out;
2810
2811 if (!cpid) { // child
2812 fclose(pids_file);
2813 pid_from_ns_wrapper(sock[1], tpid);
2814 }
2815
2816 const char *ptr = buf;
2817 while (sscanf(ptr, "%d", &qpid) == 1) {
2818 struct ucred cred;
2819 char v;
2820
2821 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2822 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2823 goto out;
2824 }
2825
2826 if (recv_creds(sock[0], &cred, &v)) {
2827 if (v == '0') {
2828 if (!may_move_pid(tpid, tuid, cred.pid)) {
2829 fail = true;
2830 break;
2831 }
2832 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2833 fail = true;
2834 }
2835 }
2836
2837 ptr = strchr(ptr, '\n');
2838 if (!ptr)
2839 break;
2840 ptr++;
2841 }
2842
2843 	/* All done - ask the child to exit by sending it pid -1 */
2844 qpid = -1;
2845 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2846 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2847
2848 if (!fail)
2849 answer = true;
2850
2851 out:
2852 if (cpid != -1)
2853 wait_for_pid(cpid);
2854 if (sock[0] != -1) {
2855 close(sock[0]);
2856 close(sock[1]);
2857 }
2858 if (pids_file) {
2859 if (fclose(pids_file) != 0)
2860 answer = false;
2861 }
2862 return answer;
2863 }
2864
2865 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2866 struct fuse_file_info *fi)
2867 {
2868 struct fuse_context *fc = fuse_get_context();
2869 char *localbuf = NULL;
2870 struct cgfs_files *k = NULL;
2871 struct file_info *f = (struct file_info *)fi->fh;
2872 bool r;
2873
2874 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2875 return -EIO;
2876
2877 if (f->type != LXC_TYPE_CGFILE) {
2878 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2879 return -EIO;
2880 }
2881
2882 if (offset)
2883 return 0;
2884
2885 localbuf = alloca(size+1);
2886 localbuf[size] = '\0';
2887 memcpy(localbuf, buf, size);
2888
2889 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2890 size = -EINVAL;
2891 goto out;
2892 }
2893
2894 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2895 size = -EACCES;
2896 goto out;
2897 }
2898
2899 if (strcmp(f->file, "tasks") == 0 ||
2900 strcmp(f->file, "/tasks") == 0 ||
2901 strcmp(f->file, "/cgroup.procs") == 0 ||
2902 strcmp(f->file, "cgroup.procs") == 0)
2903 // special case - we have to translate the pids
2904 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2905 else
2906 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2907
2908 if (!r)
2909 size = -EINVAL;
2910
2911 out:
2912 free_key(k);
2913 return size;
2914 }
2915
2916 int cg_chown(const char *path, uid_t uid, gid_t gid)
2917 {
2918 struct fuse_context *fc = fuse_get_context();
2919 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2920 struct cgfs_files *k = NULL;
2921 const char *cgroup;
2922 int ret;
2923
2924 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2925 return -EIO;
2926
2927 if (strcmp(path, "/cgroup") == 0)
2928 return -EPERM;
2929
2930 controller = pick_controller_from_path(fc, path);
2931 if (!controller)
2932 return errno == ENOENT ? -EPERM : -errno;
2933
2934 cgroup = find_cgroup_in_path(path);
2935 if (!cgroup)
2936 /* this is just /cgroup/controller */
2937 return -EPERM;
2938
2939 get_cgdir_and_path(cgroup, &cgdir, &last);
2940
2941 if (!last) {
2942 path1 = "/";
2943 path2 = cgdir;
2944 } else {
2945 path1 = cgdir;
2946 path2 = last;
2947 }
2948
2949 if (is_child_cgroup(controller, path1, path2)) {
2950 // get uid, gid, from '/tasks' file and make up a mode
2951 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2952 k = cgfs_get_key(controller, cgroup, "tasks");
2953
2954 } else
2955 k = cgfs_get_key(controller, path1, path2);
2956
2957 if (!k) {
2958 ret = -EINVAL;
2959 goto out;
2960 }
2961
2962 /*
2963 * This being a fuse request, the uid and gid must be valid
2964 * in the caller's namespace. So we can just check to make
2965 * sure that the caller is root in his uid, and privileged
2966 * over the file's current owner.
2967 */
2968 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2969 ret = -EACCES;
2970 goto out;
2971 }
2972
2973 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2974
2975 out:
2976 free_key(k);
2977 free(cgdir);
2978
2979 return ret;
2980 }
2981
2982 int cg_chmod(const char *path, mode_t mode)
2983 {
2984 struct fuse_context *fc = fuse_get_context();
2985 	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2986 struct cgfs_files *k = NULL;
2987 const char *cgroup;
2988 int ret;
2989
2990 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2991 return -EIO;
2992
2993 if (strcmp(path, "/cgroup") == 0)
2994 return -EPERM;
2995
2996 controller = pick_controller_from_path(fc, path);
2997 if (!controller)
2998 return errno == ENOENT ? -EPERM : -errno;
2999
3000 cgroup = find_cgroup_in_path(path);
3001 if (!cgroup)
3002 /* this is just /cgroup/controller */
3003 return -EPERM;
3004
3005 get_cgdir_and_path(cgroup, &cgdir, &last);
3006
3007 if (!last) {
3008 path1 = "/";
3009 path2 = cgdir;
3010 } else {
3011 path1 = cgdir;
3012 path2 = last;
3013 }
3014
3015 if (is_child_cgroup(controller, path1, path2)) {
3016 // get uid, gid, from '/tasks' file and make up a mode
3017 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3018 k = cgfs_get_key(controller, cgroup, "tasks");
3019
3020 } else
3021 k = cgfs_get_key(controller, path1, path2);
3022
3023 if (!k) {
3024 ret = -EINVAL;
3025 goto out;
3026 }
3027
3028 /*
3029 * This being a fuse request, the uid and gid must be valid
3030 * in the caller's namespace. So we can just check to make
3031 * sure that the caller is root in his uid, and privileged
3032 * over the file's current owner.
3033 */
3034 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3035 ret = -EPERM;
3036 goto out;
3037 }
3038
3039 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3040 ret = -EINVAL;
3041 goto out;
3042 }
3043
3044 ret = 0;
3045 out:
3046 free_key(k);
3047 free(cgdir);
3048 return ret;
3049 }
3050
3051 int cg_mkdir(const char *path, mode_t mode)
3052 {
3053 struct fuse_context *fc = fuse_get_context();
3054 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3055 const char *cgroup;
3056 int ret;
3057
3058 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3059 return -EIO;
3060
3061 controller = pick_controller_from_path(fc, path);
3062 if (!controller)
3063 return errno == ENOENT ? -EPERM : -errno;
3064
3065 cgroup = find_cgroup_in_path(path);
3066 if (!cgroup)
3067 return -errno;
3068
3069 get_cgdir_and_path(cgroup, &cgdir, &last);
3070 if (!last)
3071 path1 = "/";
3072 else
3073 path1 = cgdir;
3074
3075 pid_t initpid = lookup_initpid_in_store(fc->pid);
3076 if (initpid <= 1 || is_shared_pidns(initpid))
3077 initpid = fc->pid;
3078 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3079 if (!next)
3080 ret = -EINVAL;
3081 else if (last && strcmp(next, last) == 0)
3082 ret = -EEXIST;
3083 else
3084 ret = -EPERM;
3085 goto out;
3086 }
3087
3088 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3089 ret = -EACCES;
3090 goto out;
3091 }
3092 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3093 ret = -EACCES;
3094 goto out;
3095 }
3096
3097 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3098
3099 out:
3100 free(cgdir);
3101 free(next);
3102 return ret;
3103 }
3104
3105 int cg_rmdir(const char *path)
3106 {
3107 struct fuse_context *fc = fuse_get_context();
3108 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3109 const char *cgroup;
3110 int ret;
3111
3112 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3113 return -EIO;
3114
3115 controller = pick_controller_from_path(fc, path);
3116 if (!controller) /* Someone's trying to delete "/cgroup". */
3117 return -EPERM;
3118
3119 cgroup = find_cgroup_in_path(path);
3120 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3121 return -EPERM;
3122
3123 get_cgdir_and_path(cgroup, &cgdir, &last);
3124 if (!last) {
3125 /* Someone's trying to delete a cgroup on the same level as the
3126 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3127 * rmdir "/cgroup/blkio/init.slice".
3128 */
3129 ret = -EPERM;
3130 goto out;
3131 }
3132
3133 pid_t initpid = lookup_initpid_in_store(fc->pid);
3134 if (initpid <= 1 || is_shared_pidns(initpid))
3135 initpid = fc->pid;
3136 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3137 if (!last || (next && (strcmp(next, last) == 0)))
3138 ret = -EBUSY;
3139 else
3140 ret = -ENOENT;
3141 goto out;
3142 }
3143
3144 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3145 ret = -EACCES;
3146 goto out;
3147 }
3148 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3149 ret = -EACCES;
3150 goto out;
3151 }
3152
3153 if (!cgfs_remove(controller, cgroup)) {
3154 ret = -EINVAL;
3155 goto out;
3156 }
3157
3158 ret = 0;
3159
3160 out:
3161 free(cgdir);
3162 free(next);
3163 return ret;
3164 }
3165
3166 static bool startswith(const char *line, const char *pref)
3167 {
3168 if (strncmp(line, pref, strlen(pref)) == 0)
3169 return true;
3170 return false;
3171 }
3172
3173 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
3174 static void parse_memstat(int version,
3175 char *memstat,
3176 unsigned long *cached,
3177 unsigned long *active_anon,
3178 unsigned long *inactive_anon,
3179 unsigned long *active_file,
3180 unsigned long *inactive_file,
3181 unsigned long *unevictable,
3182 unsigned long *shmem)
3183 {
3184 	struct {
3185 		const char *unified;
3186 		const char *legacy;
3187 		unsigned long *value;
3188 	} keys[] = {
3189 		{ "cache",         "total_cache",         cached        },
3190 		{ "active_anon",   "total_active_anon",   active_anon   },
3191 		{ "inactive_anon", "total_inactive_anon", inactive_anon },
3192 		{ "active_file",   "total_active_file",   active_file   },
3193 		{ "inactive_file", "total_inactive_file", inactive_file },
3194 		{ "unevictable",   "total_unevictable",   unevictable   },
3195 		{ "shmem",         "total_shmem",         shmem         },
3196 	};
3197 	char *eol;
3198 	size_t i;
3199
3200 	while (*memstat) {
3201 		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
3202 			const char *key = is_unified_controller(version)
3203 					      ? keys[i].unified
3204 					      : keys[i].legacy;
3205
3206 			if (!startswith(memstat, key))
3207 				continue;
3208
3209 			/* Skip over the matched key itself so the value is
3210 			 * parsed correctly for both key variants; the old
3211 			 * fixed offsets only fit the legacy "total_*" keys.
3212 			 */
3213 			sscanf(memstat + strlen(key), "%lu", keys[i].value);
3214 			*keys[i].value /= 1024;
3215 			break;
3216 		}
3217
3218 		eol = strchr(memstat, '\n');
3219 		if (!eol)
3220 			return;
3221 		memstat = eol + 1;
3222 	}
3228 }
3229
3230 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3231 {
3232 char *eol;
3233 char key[32];
3234
3235 memset(key, 0, 32);
3236 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3237
3238 size_t len = strlen(key);
3239 *v = 0;
3240
3241 while (*str) {
3242 if (startswith(str, key)) {
3243 sscanf(str + len, "%lu", v);
3244 return;
3245 }
3246 eol = strchr(str, '\n');
3247 if (!eol)
3248 return;
3249 str = eol+1;
3250 }
3251 }
3252
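/*
 * Example of the blkio data this parses, with hypothetical values. For
 *
 *     8:0 Read 1420288
 *     8:0 Write 81920
 *     8:0 Total 1502208
 *     Total 1502208
 *
 * get_blkio_io_value(str, 8, 0, "Read", &v) builds the key "8:0 Read"
 * and sets v to 1420288.
 */
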
3253 int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
3254 {
3255 __do_free char *line = NULL;
3256 __do_fclose FILE *f = NULL;
3257 size_t linelen = 0, total_len = 0;
3258 char *cache = d->buf;
3259 size_t cache_size = d->buflen;
3260
3261 f = fopen(path, "r");
3262 if (!f)
3263 return 0;
3264
3265 while (getline(&line, &linelen, f) != -1) {
3266 ssize_t l = snprintf(cache, cache_size, "%s", line);
3267 if (l < 0) {
3268 perror("Error writing to cache");
3269 return 0;
3270 }
3271 if (l >= cache_size) {
3272 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3273 return 0;
3274 }
3275 cache += l;
3276 cache_size -= l;
3277 total_len += l;
3278 }
3279
3280 d->size = total_len;
3281 if (total_len > size)
3282 total_len = size;
3283
3284 /* read from off 0 */
3285 memcpy(buf, d->buf, total_len);
3286
3287 if (d->size > total_len)
3288 d->cached = d->size - total_len;
3289 return total_len;
3290 }
3291
3292 /*
3293 * FUSE ops for /proc
3294 */
3295
3296 static unsigned long get_memlimit(const char *cgroup, bool swap)
3297 {
3298 int ret;
3299 __do_free char *memlimit_str = NULL;
3300 unsigned long memlimit = -1;
3301
3302 if (swap)
3303 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3304 else
3305 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3306 if (ret > 0)
3307 memlimit = strtoul(memlimit_str, NULL, 10);
3308
3309 return memlimit;
3310 }
3311
3312 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3313 {
3314 __do_free char *copy = NULL;
3315 unsigned long memlimit = 0;
3316 unsigned long retlimit;
3317
3318 copy = strdup(cgroup);
3319 retlimit = get_memlimit(copy, swap);
3320
3321 while (strcmp(copy, "/") != 0) {
3322 char *it = copy;
3323
3324 it = dirname(it);
3325 memlimit = get_memlimit(it, swap);
3326 if (memlimit != -1 && memlimit < retlimit)
3327 retlimit = memlimit;
3328 	}
3329
3330 return retlimit;
3331 }
3332
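/*
 * Worked example for get_min_memlimit(): for cgroup "/lxc/c1/workload"
 * the limit on the leaf is read first, then dirname() walks up through
 * "/lxc/c1" and "/lxc" to "/", and the smallest limit found on the path
 * wins. So a 1G limit on /lxc/c1 caps a 4G limit set on the leaf.
 */
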
3333 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3334 struct fuse_file_info *fi)
3335 {
3336 __do_free char *cgroup = NULL, *line = NULL,
3337 *memusage_str = NULL, *memstat_str = NULL,
3338 *memswlimit_str = NULL, *memswusage_str = NULL;
3339 __do_fclose FILE *f = NULL;
3340 struct fuse_context *fc = fuse_get_context();
3341 	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
3342 struct file_info *d = (struct file_info *)fi->fh;
3343 unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3344 memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3345 inactive_anon = 0, active_file = 0, inactive_file = 0,
3346 unevictable = 0, shmem = 0, hostswtotal = 0;
3347 size_t linelen = 0, total_len = 0;
3348 char *cache = d->buf;
3349 size_t cache_size = d->buflen;
3350 int ret;
3351
3352 if (offset) {
3353 int left;
3354
3355 if (offset > d->size)
3356 return -EINVAL;
3357
3358 if (!d->cached)
3359 return 0;
3360
3361 left = d->size - offset;
3362 total_len = left > size ? size : left;
3363 memcpy(buf, cache + offset, total_len);
3364
3365 return total_len;
3366 }
3367
3368 pid_t initpid = lookup_initpid_in_store(fc->pid);
3369 if (initpid <= 1 || is_shared_pidns(initpid))
3370 initpid = fc->pid;
3371
3372 cgroup = get_pid_cgroup(initpid, "memory");
3373 if (!cgroup)
3374 return read_file_fuse("/proc/meminfo", buf, size, d);
3375
3376 prune_init_slice(cgroup);
3377
3378 memlimit = get_min_memlimit(cgroup, false);
3379
3380 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3381 if (ret < 0)
3382 return 0;
3383
3384 ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3385 if (ret < 0)
3386 return 0;
3387 parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3388 &active_file, &inactive_file, &unevictable, &shmem);
3389
3390 /*
3391 	 * The following values are allowed to fail, because swapaccount might
3392 	 * be turned off for the current kernel.
3393 */
3394 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3395 if (ret >= 0)
3396 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3397 if (ret >= 0) {
3398 memswlimit = get_min_memlimit(cgroup, true);
3399 memswusage = strtoul(memswusage_str, NULL, 10);
3400 memswlimit = memswlimit / 1024;
3401 memswusage = memswusage / 1024;
3402 }
3403
3404 memusage = strtoul(memusage_str, NULL, 10);
3405 memlimit /= 1024;
3406 memusage /= 1024;
3407
3408 f = fopen("/proc/meminfo", "r");
3409 if (!f)
3410 return 0;
3411
3412 while (getline(&line, &linelen, f) != -1) {
3413 ssize_t l;
3414 char *printme, lbuf[100];
3415
3416 memset(lbuf, 0, 100);
3417 if (startswith(line, "MemTotal:")) {
3418 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3419 if (hosttotal < memlimit)
3420 memlimit = hosttotal;
3421 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3422 printme = lbuf;
3423 } else if (startswith(line, "MemFree:")) {
3424 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3425 printme = lbuf;
3426 } else if (startswith(line, "MemAvailable:")) {
3427 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3428 printme = lbuf;
3429 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3430 opts && opts->swap_off == false) {
3431 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3432 if (hostswtotal < memswlimit)
3433 memswlimit = hostswtotal;
3434 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3435 printme = lbuf;
3436 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3437 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
3438 printme = lbuf;
3439 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3440 memswusage > 0 && opts && opts->swap_off == false) {
3441 unsigned long swaptotal = memswlimit,
3442 swapusage = memusage > memswusage
3443 ? 0
3444 : memswusage - memusage,
3445 swapfree = swapusage < swaptotal
3446 ? swaptotal - swapusage
3447 : 0;
3448 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3449 printme = lbuf;
3450 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3451 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
3452 printme = lbuf;
3453 } else if (startswith(line, "Slab:")) {
3454 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3455 printme = lbuf;
3456 } else if (startswith(line, "Buffers:")) {
3457 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3458 printme = lbuf;
3459 } else if (startswith(line, "Cached:")) {
3460 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3461 printme = lbuf;
3462 } else if (startswith(line, "SwapCached:")) {
3463 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3464 printme = lbuf;
3465 } else if (startswith(line, "Active:")) {
3466 snprintf(lbuf, 100, "Active: %8lu kB\n",
3467 active_anon + active_file);
3468 printme = lbuf;
3469 } else if (startswith(line, "Inactive:")) {
3470 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3471 inactive_anon + inactive_file);
3472 printme = lbuf;
3473 } else if (startswith(line, "Active(anon)")) {
3474 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3475 printme = lbuf;
3476 } else if (startswith(line, "Inactive(anon)")) {
3477 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3478 printme = lbuf;
3479 } else if (startswith(line, "Active(file)")) {
3480 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3481 printme = lbuf;
3482 } else if (startswith(line, "Inactive(file)")) {
3483 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3484 printme = lbuf;
3485 } else if (startswith(line, "Unevictable")) {
3486 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3487 printme = lbuf;
3488 } else if (startswith(line, "SReclaimable")) {
3489 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3490 printme = lbuf;
3491 } else if (startswith(line, "SUnreclaim")) {
3492 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3493 printme = lbuf;
3494 } else if (startswith(line, "Shmem:")) {
3495 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3496 printme = lbuf;
3497 } else if (startswith(line, "ShmemHugePages")) {
3498 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3499 printme = lbuf;
3500 } else if (startswith(line, "ShmemPmdMapped")) {
3501 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3502 printme = lbuf;
3503 } else
3504 printme = line;
3505
3506 l = snprintf(cache, cache_size, "%s", printme);
3507 if (l < 0) {
3508 perror("Error writing to cache");
3509 return 0;
3511 		}
3512 if (l >= cache_size) {
3513 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3514 return 0;
3515 }
3516
3517 cache += l;
3518 cache_size -= l;
3519 total_len += l;
3520 }
3521
3522 d->cached = 1;
3523 d->size = total_len;
3524 	if (total_len > size) total_len = size;
3525 memcpy(buf, d->buf, total_len);
3526
3527 return total_len;
3528 }
3529
3530 /*
3531 * Read the cpuset.cpus for cg
3532 * Return the answer in a newly allocated string which must be freed
3533 */
3534 char *get_cpuset(const char *cg)
3535 {
3536 char *value = NULL;
3537 int ret;
3538
3539 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3540 if (ret < 0)
3541 return NULL;
3542
3543 return value;
3544 }
3545
3546 bool cpu_in_cpuset(int cpu, const char *cpuset);
3547
3548 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3549 {
3550 int cpu;
3551
3552 if (sscanf(line, "processor : %d", &cpu) != 1)
3553 return false;
3554 return cpu_in_cpuset(cpu, cpuset);
3555 }
3556
3557 /*
3558 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3559  * depending on `param`. The parameter value is returned through `value`.
3560 */
3561 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3562 {
3563 __do_free char *str = NULL;
3564 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
3565
3566 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
3567
3568 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3569 return false;
3570
3571 	if (sscanf(str, "%" SCNd64, value) != 1)
3572 return false;
3573
3574 return true;
3575 }
3576
3577 /*
3578 * Return the maximum number of visible CPUs based on CPU quotas.
3579 * If there is no quota set, zero is returned.
3580 */
3581 int max_cpu_count(const char *cg)
3582 {
3583 int rv, nprocs;
3584 int64_t cfs_quota, cfs_period;
3585 int nr_cpus_in_cpuset = 0;
3586 char *cpuset = NULL;
3587
3588 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3589 return 0;
3590
3591 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3592 return 0;
3593
3594 cpuset = get_cpuset(cg);
3595 if (cpuset)
3596 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3597
3598 	if (cfs_quota <= 0 || cfs_period <= 0) {
3599 if (nr_cpus_in_cpuset > 0)
3600 return nr_cpus_in_cpuset;
3601
3602 return 0;
3603 }
3604
3605 rv = cfs_quota / cfs_period;
3606
3607 /* In case quota/period does not yield a whole number, add one CPU for
3608 * the remainder.
3609 */
3610 if ((cfs_quota % cfs_period) > 0)
3611 rv += 1;
3612
3613 nprocs = get_nprocs();
3614
3615 if (rv > nprocs)
3616 rv = nprocs;
3617
3618 /* use min value in cpu quota and cpuset */
3619 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3620 rv = nr_cpus_in_cpuset;
3621
3622 return rv;
3623 }
3624
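/*
 * Worked example: cfs_quota = 150000 and cfs_period = 100000 grant 1.5
 * CPUs worth of runtime, so max_cpu_count() reports 2 (150000 / 100000
 * = 1, plus one CPU for the remainder), further capped by get_nprocs()
 * and by the number of CPUs in the cpuset.
 */
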
3625 /*
3626 * Return the exact number of visible CPUs based on CPU quotas.
3627 * If there is no quota set, zero is returned.
3628 */
3629 static double exact_cpu_count(const char *cg)
3630 {
3631 double rv;
3632 int nprocs;
3633 int64_t cfs_quota, cfs_period;
3634
3635 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3636 return 0;
3637
3638 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3639 return 0;
3640
3641 if (cfs_quota <= 0 || cfs_period <= 0)
3642 return 0;
3643
3644 rv = (double)cfs_quota / (double)cfs_period;
3645
3646 nprocs = get_nprocs();
3647
3648 if (rv > nprocs)
3649 rv = nprocs;
3650
3651 return rv;
3652 }
3653
3654 /*
3655  * check whether this is a '^processor' line in /proc/cpuinfo
3656 */
3657 static bool is_processor_line(const char *line)
3658 {
3659 int cpu;
3660
3661 if (sscanf(line, "processor : %d", &cpu) == 1)
3662 return true;
3663 return false;
3664 }
3665
3666 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3667 struct fuse_file_info *fi)
3668 {
3669 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3670 __do_fclose FILE *f = NULL;
3671 struct fuse_context *fc = fuse_get_context();
3672 struct file_info *d = (struct file_info *)fi->fh;
3673 size_t linelen = 0, total_len = 0;
3674 bool am_printing = false, firstline = true, is_s390x = false;
3675 int curcpu = -1, cpu, max_cpus = 0;
3676 bool use_view;
3677 char *cache = d->buf;
3678 size_t cache_size = d->buflen;
3679
3680 	if (offset) {
3681 int left;
3682
3683 if (offset > d->size)
3684 return -EINVAL;
3685
3686 if (!d->cached)
3687 return 0;
3688
3689 left = d->size - offset;
3690 		total_len = left > size ? size : left;
3691 memcpy(buf, cache + offset, total_len);
3692
3693 return total_len;
3694 }
3695
3696 pid_t initpid = lookup_initpid_in_store(fc->pid);
3697 if (initpid <= 1 || is_shared_pidns(initpid))
3698 initpid = fc->pid;
3699 cg = get_pid_cgroup(initpid, "cpuset");
3700 if (!cg)
3701 		return read_file_fuse("/proc/cpuinfo", buf, size, d);
3702 prune_init_slice(cg);
3703
3704 cpuset = get_cpuset(cg);
3705 if (!cpuset)
3706 return 0;
3707
3708 use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
3709 if (use_view)
3710 max_cpus = max_cpu_count(cg);
3711
3712 f = fopen("/proc/cpuinfo", "r");
3713 if (!f)
3714 return 0;
3715
3716 while (getline(&line, &linelen, f) != -1) {
3717 ssize_t l;
3718 if (firstline) {
3719 firstline = false;
3720 if (strstr(line, "IBM/S390") != NULL) {
3721 is_s390x = true;
3722 am_printing = true;
3723 continue;
3724 }
3725 }
3726 if (strncmp(line, "# processors:", 12) == 0)
3727 continue;
3728 if (is_processor_line(line)) {
3729 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3730 break;
3731 am_printing = cpuline_in_cpuset(line, cpuset);
3732 if (am_printing) {
3733 				curcpu++;
3734 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3735 if (l < 0) {
3736 perror("Error writing to cache");
3737 return 0;
3738 }
3739 if (l >= cache_size) {
3740 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3741 return 0;
3742 }
3743 cache += l;
3744 cache_size -= l;
3745 total_len += l;
3746 }
3747 continue;
3748 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3749 char *p;
3750 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3751 break;
3752 if (!cpu_in_cpuset(cpu, cpuset))
3753 continue;
3754 			curcpu++;
3755 p = strchr(line, ':');
3756 if (!p || !*p)
3757 return 0;
3758 p++;
3759 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3760 if (l < 0) {
3761 perror("Error writing to cache");
3762 return 0;
3763 }
3764 if (l >= cache_size) {
3765 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3766 return 0;
3767 }
3768 cache += l;
3769 cache_size -= l;
3770 total_len += l;
3771 continue;
3772
3773 }
3774 if (am_printing) {
3775 l = snprintf(cache, cache_size, "%s", line);
3776 if (l < 0) {
3777 perror("Error writing to cache");
3778 return 0;
3779 }
3780 if (l >= cache_size) {
3781 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3782 return 0;
3783 }
3784 cache += l;
3785 cache_size -= l;
3786 total_len += l;
3787 }
3788 }
3789
3790 if (is_s390x) {
3791 __do_free char *origcache = d->buf;
3792 ssize_t l;
3793
3794 d->buf = malloc(d->buflen);
3795 if (!d->buf) {
3796 d->buf = move_ptr(origcache);
3797 return 0;
3798 }
3799
3800 cache = d->buf;
3801 cache_size = d->buflen;
3802 total_len = 0;
3803 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3804 if (l < 0 || l >= cache_size)
3805 return 0;
3806
3807 cache_size -= l;
3808 cache += l;
3809 total_len += l;
3810 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3811 if (l < 0 || l >= cache_size)
3812 return 0;
3813
3814 cache_size -= l;
3815 cache += l;
3816 total_len += l;
3817 l = snprintf(cache, cache_size, "%s", origcache);
3818 if (l < 0 || l >= cache_size)
3819 return 0;
3820 total_len += l;
3821 }
3822
3823 d->cached = 1;
3824 d->size = total_len;
3825 	if (total_len > size) total_len = size;
3826
3827 /* read from off 0 */
3828 memcpy(buf, d->buf, total_len);
3829 return total_len;
3830 }
3831
3832 static uint64_t get_reaper_start_time(pid_t pid)
3833 {
3834 int ret;
3835 FILE *f;
3836 uint64_t starttime;
3837 /* strlen("/proc/") = 6
3838 * +
3839 * LXCFS_NUMSTRLEN64
3840 * +
3841 * strlen("/stat") = 5
3842 * +
3843 * \0 = 1
3844 * */
3845 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3846 char path[__PROC_PID_STAT_LEN];
3847 pid_t qpid;
3848
3849 qpid = lookup_initpid_in_store(pid);
3850 if (qpid <= 0) {
3851 /* Caller can check for EINVAL on 0. */
3852 errno = EINVAL;
3853 return 0;
3854 }
3855
3856 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3857 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3858 /* Caller can check for EINVAL on 0. */
3859 errno = EINVAL;
3860 return 0;
3861 }
3862
3863 f = fopen(path, "r");
3864 if (!f) {
3865 /* Caller can check for EINVAL on 0. */
3866 errno = EINVAL;
3867 return 0;
3868 }
3869
3870 	/* Note that the *scanf() argument suppression requires that length
3871 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3872 * at us. It's like telling someone you're not married and then asking
3873 * if you can bring your wife to the party.
3874 */
3875 ret = fscanf(f, "%*d " /* (1) pid %d */
3876 "%*s " /* (2) comm %s */
3877 "%*c " /* (3) state %c */
3878 "%*d " /* (4) ppid %d */
3879 "%*d " /* (5) pgrp %d */
3880 "%*d " /* (6) session %d */
3881 "%*d " /* (7) tty_nr %d */
3882 "%*d " /* (8) tpgid %d */
3883 "%*u " /* (9) flags %u */
3884 "%*u " /* (10) minflt %lu */
3885 "%*u " /* (11) cminflt %lu */
3886 "%*u " /* (12) majflt %lu */
3887 "%*u " /* (13) cmajflt %lu */
3888 "%*u " /* (14) utime %lu */
3889 "%*u " /* (15) stime %lu */
3890 "%*d " /* (16) cutime %ld */
3891 "%*d " /* (17) cstime %ld */
3892 "%*d " /* (18) priority %ld */
3893 "%*d " /* (19) nice %ld */
3894 "%*d " /* (20) num_threads %ld */
3895 "%*d " /* (21) itrealvalue %ld */
3896 "%" PRIu64, /* (22) starttime %llu */
3897 &starttime);
3898 if (ret != 1) {
3899 fclose(f);
3900 /* Caller can check for EINVAL on 0. */
3901 errno = EINVAL;
3902 return 0;
3903 }
3904
3905 fclose(f);
3906
3907 errno = 0;
3908 return starttime;
3909 }
3910
3911 static double get_reaper_start_time_in_sec(pid_t pid)
3912 {
3913 uint64_t clockticks, ticks_per_sec;
3914 int64_t ret;
3915 double res = 0;
3916
3917 clockticks = get_reaper_start_time(pid);
3918 if (clockticks == 0 && errno == EINVAL) {
3919 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3920 return 0;
3921 }
3922
3923 ret = sysconf(_SC_CLK_TCK);
3924 if (ret < 0 && errno == EINVAL) {
3925 lxcfs_debug(
3926 "%s\n",
3927 "failed to determine number of clock ticks in a second");
3928 return 0;
3929 }
3930
3931 ticks_per_sec = (uint64_t)ret;
3932 res = (double)clockticks / ticks_per_sec;
3933 return res;
3934 }
3935
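/*
 * Worked example: with the common _SC_CLK_TCK of 100, a starttime of
 * 4492 clock ticks read from /proc/<pid>/stat converts to 4492 / 100 =
 * 44.92 seconds after boot.
 */
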
3936 static double get_reaper_age(pid_t pid)
3937 {
3938 uint64_t uptime_ms;
3939 double procstart, procage;
3940
3941 	/* To get the reaper's age we subtract the time at which the process
3942 	 * started (measured from system boot) from the current system
3943 	 * uptime.
3944 	 */
3945 procstart = get_reaper_start_time_in_sec(pid);
3946 procage = procstart;
3947 if (procstart > 0) {
3948 int ret;
3949 struct timespec spec;
3950
3951 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3952 if (ret < 0)
3953 return 0;
3954
3955 /* We could make this more precise here by using the tv_nsec
3956 * field in the timespec struct and convert it to milliseconds
3957 * and then create a double for the seconds and milliseconds but
3958 * that seems more work than it is worth.
3959 */
3960 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3961 procage = (uptime_ms - (procstart * 1000)) / 1000;
3962 }
3963
3964 return procage;
3965 }
3966
3967 /*
3968 * Returns 0 on success.
3969 * It is the caller's responsibility to free `return_usage`, unless this
3970 * function returns an error.
3971 */
3972 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
3973 {
3974 __do_free char *usage_str = NULL;
3975 __do_free struct cpuacct_usage *cpu_usage = NULL;
3976 int cpucount = get_nprocs_conf();
3977 	int read_pos = 0, read_cnt = 0;
3978 int i, j, ret;
3979 int cg_cpu;
3980 uint64_t cg_user, cg_system;
3981 int64_t ticks_per_sec;
3982
3983 ticks_per_sec = sysconf(_SC_CLK_TCK);
3984
3985 if (ticks_per_sec < 0 && errno == EINVAL) {
3986 lxcfs_v(
3987 "%s\n",
3988 "read_cpuacct_usage_all failed to determine number of clock ticks "
3989 "in a second");
3990 return -1;
3991 }
3992
3993 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3994 if (!cpu_usage)
3995 return -ENOMEM;
3996
3997 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
3998 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3999 char *data = NULL;
4000 		int i = 0, read_pos = 0, read_cnt = 0;
4001 size_t sz = 0, asz = 0;
4002
4003 /* read cpuacct.usage_percpu instead. */
4004 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4005 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
4006 return -1;
4007 lxcfs_v("usage_str: %s\n", usage_str);
4008
4009 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
4010 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4011
4012 must_strcat(&data, &sz, &asz, "cpu user system\n");
4013
4014 		while (sscanf(usage_str + read_pos, "%" SCNu64 " %n", &cg_user, &read_cnt) > 0) {
4015 			lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4016 			must_strcat(&data, &sz, &asz, "%d %" PRIu64 " 0\n", i, cg_user);
4017 i++;
4018 read_pos += read_cnt;
4019 }
4020
4021 usage_str = data;
4022
4023 lxcfs_v("usage_str: %s\n", usage_str);
4024 }
4025
4026 	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) == EOF || read_cnt == 0) {
4027 lxcfs_error("read_cpuacct_usage_all reading first line from "
4028 "%s/cpuacct.usage_all failed.\n", cg);
4029 return -1;
4030 }
4031
4032 read_pos += read_cnt;
4033
4034 for (i = 0, j = 0; i < cpucount; i++) {
4035 		ret = sscanf(usage_str + read_pos, "%d %" SCNu64 " %" SCNu64 "\n%n",
4036 			     &cg_cpu, &cg_user, &cg_system, &read_cnt);
4037
4038 if (ret == EOF)
4039 break;
4040
4041 if (ret != 3) {
4042 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4043 "failed.\n", cg);
4044 return -1;
4045 }
4046
4047 read_pos += read_cnt;
4048
4049 /* Convert the time from nanoseconds to USER_HZ */
4050 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4051 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4052 j++;
4053 }
4054
4055 *return_usage = move_ptr(cpu_usage);
4056 *size = cpucount;
4057 return 0;
4058 }
4059
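/*
 * Example of the two formats handled above, with hypothetical values.
 * cpuacct.usage_all (per-cpu user and system time in nanoseconds):
 *
 *     cpu user system
 *     0 2840000000 1220000000
 *     1 1950000000 870000000
 *
 * cpuacct.usage_percpu (one total per cpu) is converted into the same
 * layout with the system column set to 0:
 *
 *     4060000000 2820000000
 */
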
4060 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4061 {
4062 int i;
4063 unsigned long sum = 0;
4064
4065 for (i = 0; i < cpu_count; i++) {
4066 if (!newer[i].online)
4067 continue;
4068
4069 /* When cpuset is changed on the fly, the CPUs might get reordered.
4070 * We could either reset all counters, or check that the substractions
4071 * below will return expected results.
4072 */
4073 if (newer[i].user > older[i].user)
4074 diff[i].user = newer[i].user - older[i].user;
4075 else
4076 diff[i].user = 0;
4077
4078 if (newer[i].system > older[i].system)
4079 diff[i].system = newer[i].system - older[i].system;
4080 else
4081 diff[i].system = 0;
4082
4083 if (newer[i].idle > older[i].idle)
4084 diff[i].idle = newer[i].idle - older[i].idle;
4085 else
4086 diff[i].idle = 0;
4087
4088 sum += diff[i].user;
4089 sum += diff[i].system;
4090 sum += diff[i].idle;
4091 }
4092
4093 return sum;
4094 }
4095
4096 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4097 {
4098 unsigned long free_space, to_add;
4099
4100 free_space = threshold - usage->user - usage->system;
4101
4102 if (free_space > usage->idle)
4103 free_space = usage->idle;
4104
4105 to_add = free_space > *surplus ? *surplus : free_space;
4106
4107 *counter += to_add;
4108 usage->idle -= to_add;
4109 *surplus -= to_add;
4110 }
4111
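/*
 * Worked example for add_cpu_usage(): with threshold = 100, a cpu at
 * user = 40 and system = 20 has free_space = 40 ticks, capped by its
 * idle time. With *surplus = 50 and idle = 30 that gives to_add = 30:
 * the counter grows by 30, idle drops to 0, and 20 ticks of surplus
 * remain for the next cpu.
 */
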
4112 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4113 {
4114 struct cg_proc_stat *first = NULL, *prev, *tmp;
4115
4116 for (prev = NULL; node; ) {
4117 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4118 tmp = node;
4119 lxcfs_debug("Removing stat node for %s\n", node->cg);
4120
4121 if (prev)
4122 prev->next = node->next;
4123 else
4124 first = node->next;
4125
4126 node = node->next;
4127 free_proc_stat_node(tmp);
4128 } else {
4129 if (!first)
4130 first = node;
4131 prev = node;
4132 node = node->next;
4133 }
4134 }
4135
4136 return first;
4137 }
4138
4139 #define PROC_STAT_PRUNE_INTERVAL 10
4140 static void prune_proc_stat_history(void)
4141 {
4142 int i;
4143 time_t now = time(NULL);
4144
4145 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4146 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4147
4148 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4149 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4150 return;
4151 }
4152
4153 if (proc_stat_history[i]->next) {
4154 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4155 proc_stat_history[i]->lastcheck = now;
4156 }
4157
4158 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4159 }
4160 }
4161
4162 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4163 {
4164 struct cg_proc_stat *node;
4165
4166 pthread_rwlock_rdlock(&head->lock);
4167
4168 if (!head->next) {
4169 pthread_rwlock_unlock(&head->lock);
4170 return NULL;
4171 }
4172
4173 node = head->next;
4174
4175 do {
4176 if (strcmp(cg, node->cg) == 0)
4177 goto out;
4178 } while ((node = node->next));
4179
4180 node = NULL;
4181
4182 out:
4183 pthread_rwlock_unlock(&head->lock);
4184 prune_proc_stat_history();
4185 return node;
4186 }
4187
4188 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4189 {
4190 struct cg_proc_stat *node;
4191 int i;
4192
4193 node = malloc(sizeof(struct cg_proc_stat));
4194 if (!node)
4195 goto err;
4196
4197 node->cg = NULL;
4198 node->usage = NULL;
4199 node->view = NULL;
4200
4201 node->cg = malloc(strlen(cg) + 1);
4202 if (!node->cg)
4203 goto err;
4204
4205 strcpy(node->cg, cg);
4206
4207 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4208 if (!node->usage)
4209 goto err;
4210
4211 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4212
4213 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4214 if (!node->view)
4215 goto err;
4216
4217 node->cpu_count = cpu_count;
4218 node->next = NULL;
4219
4220 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4221 lxcfs_error("%s\n", "Failed to initialize node lock");
4222 goto err;
4223 }
4224
4225 for (i = 0; i < cpu_count; i++) {
4226 node->view[i].user = 0;
4227 node->view[i].system = 0;
4228 node->view[i].idle = 0;
4229 }
4230
4231 return node;
4232
4233 err:
4234 if (node && node->cg)
4235 free(node->cg);
4236 if (node && node->usage)
4237 free(node->usage);
4238 if (node && node->view)
4239 free(node->view);
4240 if (node)
4241 free(node);
4242
4243 return NULL;
4244 }
4245
4246 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4247 {
4248 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4249 struct cg_proc_stat_head *head = proc_stat_history[hash];
4250 struct cg_proc_stat *node, *rv = new_node;
4251
4252 pthread_rwlock_wrlock(&head->lock);
4253
4254 if (!head->next) {
4255 head->next = new_node;
4256 goto out;
4257 }
4258
4259 node = head->next;
4260
4261 for (;;) {
4262 if (strcmp(node->cg, new_node->cg) == 0) {
4263 /* The node is already present, return it */
4264 free_proc_stat_node(new_node);
4265 rv = node;
4266 goto out;
4267 }
4268
4269 if (node->next) {
4270 node = node->next;
4271 continue;
4272 }
4273
4274 node->next = new_node;
4275 goto out;
4276 }
4277
4278 out:
4279 pthread_rwlock_unlock(&head->lock);
4280 return rv;
4281 }
4282
4283 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4284 {
4285 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
4286
4287 /* Allocate new memory */
4288 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4289 if (!new_usage)
4290 return false;
4291
4292 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4293 if (!new_view)
4294 return false;
4295
4296 /* Copy existing data & initialize new elements */
4297 for (int i = 0; i < cpu_count; i++) {
4298 if (i < node->cpu_count) {
4299 new_usage[i].user = node->usage[i].user;
4300 new_usage[i].system = node->usage[i].system;
4301 new_usage[i].idle = node->usage[i].idle;
4302
4303 new_view[i].user = node->view[i].user;
4304 new_view[i].system = node->view[i].system;
4305 new_view[i].idle = node->view[i].idle;
4306 } else {
4307 new_usage[i].user = 0;
4308 new_usage[i].system = 0;
4309 new_usage[i].idle = 0;
4310
4311 new_view[i].user = 0;
4312 new_view[i].system = 0;
4313 new_view[i].idle = 0;
4314 }
4315 }
4316
4317 free(node->usage);
4318 node->usage = move_ptr(new_usage);
4319
4320 free(node->view);
4321 node->view = move_ptr(new_view);
4322 node->cpu_count = cpu_count;
4323
4324 return true;
4325 }
4326
4327 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4328 {
4329 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4330 struct cg_proc_stat_head *head = proc_stat_history[hash];
4331 struct cg_proc_stat *node;
4332
4333 node = find_proc_stat_node(head, cg);
4334
4335 if (!node) {
4336 node = new_proc_stat_node(usage, cpu_count, cg);
4337 if (!node)
4338 return NULL;
4339
4340 node = add_proc_stat_node(node);
4341 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4342 }
4343
4344 pthread_mutex_lock(&node->lock);
4345
4346 /* If additional CPUs on the host have been enabled, CPU usage counter
4347 * arrays have to be expanded */
4348 if (node->cpu_count < cpu_count) {
4349 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4350 node->cpu_count, cpu_count, cg);
4351
4352 if (!expand_proc_stat_node(node, cpu_count)) {
4353 pthread_mutex_unlock(&node->lock);
4354 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4355 node->cpu_count, cpu_count, cg);
4356 return NULL;
4357 }
4358 }
4359
4360 return node;
4361 }
4362
4363 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4364 {
4365 int i;
4366
4367 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4368 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4369
4370 for (i = 0; i < cpu_count; i++) {
4371 node->view[i].user = 0;
4372 node->view[i].system = 0;
4373 node->view[i].idle = 0;
4374 }
4375
4376 node->cpu_count = cpu_count;
4377 }
4378
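/*
 * cpuview_proc_stat() renders a per-container view of /proc/stat from the
 * cached cpuacct history. As a rough sketch (hypothetical numbers), a
 * container confined to two CPUs would read something shaped like:
 *
 *   cpu  10898 0 2639 4902064 0 0 0 0 0 0
 *   cpu0 5761 0 1446 2450491 0 0 0 0 0 0
 *   cpu1 5137 0 1193 2451573 0 0 0 0 0 0
 *   intr 1462380 ...   <- passed through from the host
 *
 * Only the "cpu"/"cpuN" lines are rewritten; everything after them is
 * copied verbatim from the host's /proc/stat.
 */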
4379 static int cpuview_proc_stat(const char *cg, const char *cpuset,
4380 struct cpuacct_usage *cg_cpu_usage,
4381 int cg_cpu_usage_size, FILE *f, char *buf,
4382 size_t buf_size)
4383 {
4384 __do_free char *line = NULL;
4385 __do_free struct cpuacct_usage *diff = NULL;
size_t linelen = 0, total_len = 0;
ssize_t l;
4387 int curcpu = -1; /* cpu numbering starts at 0 */
4388 int physcpu, i;
4389 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4390 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4391 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4392 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4393 unsigned long user_surplus = 0, system_surplus = 0;
4394 unsigned long total_sum, threshold;
4395 struct cg_proc_stat *stat_node;
4396 int nprocs = get_nprocs_conf();
4397
4398 if (cg_cpu_usage_size < nprocs)
4399 nprocs = cg_cpu_usage_size;
4400
4401 /* Read all CPU stats and stop when we've encountered other lines */
4402 while (getline(&line, &linelen, f) != -1) {
4403 int ret;
4404 char cpu_char[10]; /* That's a lot of cores */
4405 uint64_t all_used, cg_used;
4406
4407 if (strlen(line) == 0)
4408 continue;
4409
4410 /* not a ^cpuN line containing a number N */
4411 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
4412 break;
4413
4414 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4415 continue;
4416
4417 if (physcpu >= cg_cpu_usage_size)
4418 continue;
4419
curcpu++;
cpu_cnt++;
4422
4423 if (!cpu_in_cpuset(physcpu, cpuset)) {
4424 for (i = curcpu; i <= physcpu; i++)
4425 cg_cpu_usage[i].online = false;
4426 continue;
4427 }
4428
4429 if (curcpu < physcpu) {
4430 /* Some CPUs may be disabled */
4431 for (i = curcpu; i < physcpu; i++)
4432 cg_cpu_usage[i].online = false;
4433
4434 curcpu = physcpu;
4435 }
4436
4437 cg_cpu_usage[curcpu].online = true;
4438
4439 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4440 &user,
4441 &nice,
4442 &system,
4443 &idle,
4444 &iowait,
4445 &irq,
4446 &softirq,
4447 &steal,
4448 &guest,
4449 &guest_nice);
4450
4451 if (ret != 10)
4452 continue;
4453
4454 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4455 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4456
4457 if (all_used >= cg_used) {
cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
} else {
4461 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4462 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4463 curcpu, cg, all_used, cg_used);
4464 cg_cpu_usage[curcpu].idle = idle;
4465 }
4466 }
4467
4468 /* Cannot use more CPUs than is available due to cpuset */
4469 if (max_cpus > cpu_cnt)
4470 max_cpus = cpu_cnt;
4471
4472 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4473
4474 if (!stat_node) {
4475 lxcfs_error("unable to find/create stat node for %s\n", cg);
4476 return 0;
4477 }
4478
diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
if (!diff) {
/* Don't leak the node lock taken in find_or_create_proc_stat_node(). */
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4483
4484 /*
4485 * If the new values are LOWER than values stored in memory, it means
4486 * the cgroup has been reset/recreated and we should reset too.
4487 */
4488 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4489 if (!cg_cpu_usage[curcpu].online)
4490 continue;
4491
4492 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4493 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4494
4495 break;
4496 }
4497
4498 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4499
4500 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4501 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4502
4503 if (!stat_node->usage[curcpu].online)
4504 continue;
4505
4506 i++;
4507
4508 stat_node->usage[curcpu].user += diff[curcpu].user;
4509 stat_node->usage[curcpu].system += diff[curcpu].system;
4510 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4511
4512 if (max_cpus > 0 && i >= max_cpus) {
4513 user_surplus += diff[curcpu].user;
4514 system_surplus += diff[curcpu].system;
4515 }
4516 }
4517
4518 /* Calculate usage counters of visible CPUs */
4519 if (max_cpus > 0) {
4520 unsigned long diff_user = 0;
4521 unsigned long diff_system = 0;
4522 unsigned long diff_idle = 0;
4523 unsigned long max_diff_idle = 0;
4524 unsigned long max_diff_idle_index = 0;
4525 double exact_cpus;
4526
4527 /* threshold = maximum usage per cpu, including idle */
4528 threshold = total_sum / cpu_cnt * max_cpus;
4529
4530 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4531 if (!stat_node->usage[curcpu].online)
4532 continue;
4533
4534 i++;
4535
4536 if (i == max_cpus)
4537 break;
4538
4539 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4540 continue;
4541
4542 /* Add user */
4543 add_cpu_usage(&user_surplus, &diff[curcpu],
4544 &diff[curcpu].user, threshold);
4545
4546 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4547 continue;
4548
4549 /* If there is still room, add system */
4550 add_cpu_usage(&system_surplus, &diff[curcpu],
4551 &diff[curcpu].system, threshold);
4552 }
4553
4554 if (user_surplus > 0)
4555 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4556 if (system_surplus > 0)
4557 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4558
4559 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4560 if (!stat_node->usage[curcpu].online)
4561 continue;
4562
4563 i++;
4564
4565 if (i == max_cpus)
4566 break;
4567
4568 stat_node->view[curcpu].user += diff[curcpu].user;
4569 stat_node->view[curcpu].system += diff[curcpu].system;
4570 stat_node->view[curcpu].idle += diff[curcpu].idle;
4571
4572 user_sum += stat_node->view[curcpu].user;
4573 system_sum += stat_node->view[curcpu].system;
4574 idle_sum += stat_node->view[curcpu].idle;
4575
4576 diff_user += diff[curcpu].user;
4577 diff_system += diff[curcpu].system;
4578 diff_idle += diff[curcpu].idle;
4579 if (diff[curcpu].idle > max_diff_idle) {
4580 max_diff_idle = diff[curcpu].idle;
4581 max_diff_idle_index = curcpu;
4582 }
4583
4584 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4585 }
4586 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4587
4588 /* revise cpu usage view to support partial cpu case. */
4589 exact_cpus = exact_cpu_count(cg);
4590 if (exact_cpus < (double)max_cpus){
4591 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
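/*
 * Worked example (hypothetical numbers): with max_cpus = 2 but a quota
 * equivalent to exact_cpus = 1.5, a quarter of the diff total
 * (1 - 1.5/2 = 0.25) is carved out of idle so the rendered view adds
 * up to roughly 1.5 CPUs of time.
 */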
4592
4593 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4594 lxcfs_v("delta: %lu\n", delta);
4595 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4596 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4597 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4598
4599 curcpu = max_diff_idle_index;
4600 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4601 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4602 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4603 }
4604 } else {
4605 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4606 if (!stat_node->usage[curcpu].online)
4607 continue;
4608
4609 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4610 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4611 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4612
4613 user_sum += stat_node->view[curcpu].user;
4614 system_sum += stat_node->view[curcpu].system;
4615 idle_sum += stat_node->view[curcpu].idle;
4616 }
4617 }
4618
4619 /* Render the file */
4620 /* cpu-all */
4621 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4622 user_sum,
4623 system_sum,
4624 idle_sum);
4625 lxcfs_v("cpu-all: %s\n", buf);
4626
if (l < 0) {
perror("Error writing to cache");
/* Drop the node lock taken in find_or_create_proc_stat_node(). */
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4635
4636 buf += l;
4637 buf_size -= l;
4638 total_len += l;
4639
4640 /* Render visible CPUs */
4641 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4642 if (!stat_node->usage[curcpu].online)
4643 continue;
4644
4645 i++;
4646
4647 if (max_cpus > 0 && i == max_cpus)
4648 break;
4649
4650 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4651 i,
4652 stat_node->view[curcpu].user,
4653 stat_node->view[curcpu].system,
4654 stat_node->view[curcpu].idle);
4655 lxcfs_v("cpu: %s\n", buf);
4656
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4666
4667 buf += l;
4668 buf_size -= l;
4669 total_len += l;
4670 }
4671
4672 /* Pass the rest of /proc/stat, start with the last line read */
4673 l = snprintf(buf, buf_size, "%s", line);
4674
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4684
4685 buf += l;
4686 buf_size -= l;
4687 total_len += l;
4688
4689 /* Pass the rest of the host's /proc/stat */
4690 while (getline(&line, &linelen, f) != -1) {
4691 l = snprintf(buf, buf_size, "%s", line);
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4700 buf += l;
4701 buf_size -= l;
4702 total_len += l;
4703 }
4704
4705 if (stat_node)
4706 pthread_mutex_unlock(&stat_node->lock);
4707 return total_len;
4708 }
4709
4710 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4711 static int proc_stat_read(char *buf, size_t size, off_t offset,
4712 struct fuse_file_info *fi)
4713 {
4714 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
4715 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
4716 __do_fclose FILE *f = NULL;
4717 struct fuse_context *fc = fuse_get_context();
4718 struct file_info *d = (struct file_info *)fi->fh;
4719 size_t linelen = 0, total_len = 0;
4720 int curcpu = -1; /* cpu numbering starts at 0 */
4721 int physcpu = 0;
4722 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4723 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4724 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
4725 iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
4726 steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4727 char cpuall[CPUALL_MAX_SIZE];
4728 /* reserve for cpu all */
4729 char *cache = d->buf + CPUALL_MAX_SIZE;
4730 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4731 int cg_cpu_usage_size = 0;
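
/*
 * Sketch of the d->buf layout while rendering:
 *
 *   [0, CPUALL_MAX_SIZE)           reserved for the aggregate "cpu" line
 *   [CPUALL_MAX_SIZE, d->buflen)   per-cpu and passthrough lines (cache)
 *
 * Once all lines are collected, the "cpu" summary is written at the front
 * and the cached lines are memmove()d up behind it.
 */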
4732
4733 if (offset){
4734 if (offset > d->size)
4735 return -EINVAL;
4736 if (!d->cached)
4737 return 0;
4738 int left = d->size - offset;
4739 total_len = left > size ? size: left;
4740 memcpy(buf, d->buf + offset, total_len);
4741 return total_len;
4742 }
4743
4744 pid_t initpid = lookup_initpid_in_store(fc->pid);
4745 lxcfs_v("initpid: %d\n", initpid);
4746 if (initpid <= 0)
4747 initpid = fc->pid;
4748
/*
 * When a container runs with the host's pid namespace, initpid == 1 and
 * its cgroup is "/", so we should return the host's /proc contents.
 * In some cases cpuacct.usage_all for "/" will be larger than what
 * /proc/stat reports.
 */
4754 if (initpid == 1) {
4755 return read_file_fuse("/proc/stat", buf, size, d);
4756 }
4757
4758 cg = get_pid_cgroup(initpid, "cpuset");
4759 lxcfs_v("cg: %s\n", cg);
4760 if (!cg)
4761 return read_file_fuse("/proc/stat", buf, size, d);
4762 prune_init_slice(cg);
4763
4764 cpuset = get_cpuset(cg);
4765 if (!cpuset)
4766 return 0;
4767
4768 /*
4769 * Read cpuacct.usage_all for all CPUs.
4770 * If the cpuacct cgroup is present, it is used to calculate the container's
4771 * CPU usage. If not, values from the host's /proc/stat are used.
4772 */
4773 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4774 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
4775 "falling back to the host's /proc/stat");
4776 }
4777
4778 f = fopen("/proc/stat", "r");
4779 if (!f)
4780 return 0;
4781
4782 //skip first line
4783 if (getline(&line, &linelen, f) < 0) {
4784 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4785 return 0;
4786 }
4787
4788 if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
4789 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4790 f, d->buf, d->buflen);
4791 goto out;
4792 }
4793
4794 while (getline(&line, &linelen, f) != -1) {
4795 ssize_t l;
4796 char cpu_char[10]; /* That's a lot of cores */
4797 char *c;
4798 uint64_t all_used, cg_used, new_idle;
4799 int ret;
4800
4801 if (strlen(line) == 0)
4802 continue;
4803 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4804 /* not a ^cpuN line containing a number N, just print it */
4805 l = snprintf(cache, cache_size, "%s", line);
4806 if (l < 0) {
4807 perror("Error writing to cache");
4808 return 0;
4809 }
4810 if (l >= cache_size) {
4811 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4812 return 0;
4813 }
4814 cache += l;
4815 cache_size -= l;
4816 total_len += l;
4817 continue;
4818 }
4819
4820 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4821 continue;
4822 if (!cpu_in_cpuset(physcpu, cpuset))
4823 continue;
curcpu++;
4825
4826 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4827 &user,
4828 &nice,
4829 &system,
4830 &idle,
4831 &iowait,
4832 &irq,
4833 &softirq,
4834 &steal,
4835 &guest,
4836 &guest_nice);
4837
4838 if (ret != 10 || !cg_cpu_usage) {
4839 c = strchr(line, ' ');
4840 if (!c)
4841 continue;
4842 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
if (l < 0) {
perror("Error writing to cache");
return 0;
}
4848 if (l >= cache_size) {
4849 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4850 return 0;
4851 }
4852
4853 cache += l;
4854 cache_size -= l;
4855 total_len += l;
4856
4857 if (ret != 10)
4858 continue;
4859 }
4860
4861 if (cg_cpu_usage) {
4862 if (physcpu >= cg_cpu_usage_size)
4863 break;
4864
4865 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4866 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4867
4868 if (all_used >= cg_used) {
new_idle = idle + (all_used - cg_used);
} else {
4872 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4873 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4874 curcpu, cg, all_used, cg_used);
4875 new_idle = idle;
4876 }
4877
4878 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4879 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4880 new_idle);
4881
if (l < 0) {
perror("Error writing to cache");
return 0;
}
4887 if (l >= cache_size) {
4888 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4889 return 0;
4890 }
4891
4892 cache += l;
4893 cache_size -= l;
4894 total_len += l;
4895
4896 user_sum += cg_cpu_usage[physcpu].user;
4897 system_sum += cg_cpu_usage[physcpu].system;
4898 idle_sum += new_idle;
4899
4900 } else {
4901 user_sum += user;
4902 nice_sum += nice;
4903 system_sum += system;
4904 idle_sum += idle;
4905 iowait_sum += iowait;
4906 irq_sum += irq;
4907 softirq_sum += softirq;
4908 steal_sum += steal;
4909 guest_sum += guest;
4910 guest_nice_sum += guest_nice;
4911 }
4912 }
4913
4914 cache = d->buf;
4915
4916 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4917 user_sum,
4918 nice_sum,
4919 system_sum,
4920 idle_sum,
4921 iowait_sum,
4922 irq_sum,
4923 softirq_sum,
4924 steal_sum,
4925 guest_sum,
4926 guest_nice_sum);
4927 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4928 memcpy(cache, cpuall, cpuall_len);
4929 cache += cpuall_len;
4930 } else {
4931 /* shouldn't happen */
4932 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4933 cpuall_len = 0;
4934 }
4935
4936 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4937 total_len += cpuall_len;
4938
4939 out:
4940 d->cached = 1;
4941 d->size = total_len;
4942 if (total_len > size)
4943 total_len = size;
4944
4945 memcpy(buf, d->buf, total_len);
4946 return total_len;
4947 }
4948
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 */
4956 static double get_reaper_busy(pid_t task)
4957 {
4958 __do_free char *cgroup = NULL, *usage_str = NULL;
4959 unsigned long usage = 0;
4960 pid_t initpid;
4961
4962 initpid = lookup_initpid_in_store(task);
4963 if (initpid <= 0)
4964 return 0;
4965
4966 cgroup = get_pid_cgroup(initpid, "cpuacct");
4967 if (!cgroup)
4968 return 0;
4969 prune_init_slice(cgroup);
4970 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4971 &usage_str))
4972 return 0;
4973
4974 usage = strtoul(usage_str, NULL, 10);
4975 return ((double)usage / 1000000000);
4976 }
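
/*
 * For example, a cpuacct.usage value of "2500000000" (nanoseconds) yields a
 * busy time of 2.5 seconds, which proc_uptime_read() below subtracts from
 * the reaper's age to fake the idle field of /proc/uptime.
 */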
4977
4978 #if RELOADTEST
4979 void iwashere(void)
4980 {
4981 int fd;
4982
4983 fd = creat("/tmp/lxcfs-iwashere", 0644);
4984 if (fd >= 0)
4985 close(fd);
4986 }
4987 #endif
4988
/*
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the age of the reaper for the calling pid,
 * as returned by get_reaper_age().
 */
4994 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4995 struct fuse_file_info *fi)
4996 {
4997 struct fuse_context *fc = fuse_get_context();
4998 struct file_info *d = (struct file_info *)fi->fh;
4999 double busytime = get_reaper_busy(fc->pid);
5000 char *cache = d->buf;
5001 ssize_t total_len = 0;
5002 double idletime, reaperage;
5003
5004 #if RELOADTEST
5005 iwashere();
5006 #endif
5007
5008 if (offset){
5009 if (!d->cached)
5010 return 0;
5011 if (offset > d->size)
5012 return -EINVAL;
5013 int left = d->size - offset;
5014 total_len = left > size ? size: left;
5015 memcpy(buf, cache + offset, total_len);
5016 return total_len;
5017 }
5018
5019 reaperage = get_reaper_age(fc->pid);
5020 /* To understand why this is done, please read the comment to the
5021 * get_reaper_busy() function.
5022 */
5023 idletime = reaperage;
5024 if (reaperage >= busytime)
5025 idletime = reaperage - busytime;
5026
5027 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5028 if (total_len < 0 || total_len >= d->buflen){
5029 lxcfs_error("%s\n", "failed to write to cache");
5030 return 0;
5031 }
5032
5033 d->size = (int)total_len;
5034 d->cached = 1;
5035
5036 if (total_len > size) total_len = size;
5037
5038 memcpy(buf, d->buf, total_len);
5039 return total_len;
5040 }
5041
5042 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5043 struct fuse_file_info *fi)
5044 {
5045 __do_free char *cg = NULL, *io_serviced_str = NULL,
5046 *io_merged_str = NULL, *io_service_bytes_str = NULL,
5047 *io_wait_time_str = NULL, *io_service_time_str = NULL,
5048 *line = NULL;
5049 __do_fclose FILE *f = NULL;
5050 struct fuse_context *fc = fuse_get_context();
5051 struct file_info *d = (struct file_info *)fi->fh;
5052 unsigned long read = 0, write = 0;
5053 unsigned long read_merged = 0, write_merged = 0;
5054 unsigned long read_sectors = 0, write_sectors = 0;
5055 unsigned long read_ticks = 0, write_ticks = 0;
5056 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5057 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5058 char *cache = d->buf;
5059 size_t cache_size = d->buflen;
5060 size_t linelen = 0, total_len = 0;
5061 unsigned int major = 0, minor = 0;
5062 int i = 0;
5063 int ret;
5064 char dev_name[72];
5065
5066 if (offset){
5067 int left;
5068
5069 if (offset > d->size)
5070 return -EINVAL;
5071
5072 if (!d->cached)
5073 return 0;
5074
5075 left = d->size - offset;
5076 total_len = left > size ? size: left;
5077 memcpy(buf, cache + offset, total_len);
5078
5079 return total_len;
5080 }
5081
5082 pid_t initpid = lookup_initpid_in_store(fc->pid);
5083 if (initpid <= 1 || is_shared_pidns(initpid))
5084 initpid = fc->pid;
5085 cg = get_pid_cgroup(initpid, "blkio");
5086 if (!cg)
5087 return read_file_fuse("/proc/diskstats", buf, size, d);
5088 prune_init_slice(cg);
5089
5090 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
5091 if (ret < 0) {
5092 if (ret == -EOPNOTSUPP)
5093 return read_file_fuse("/proc/diskstats", buf, size, d);
5094 }
5095
5096 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
5097 if (ret < 0) {
5098 if (ret == -EOPNOTSUPP)
5099 return read_file_fuse("/proc/diskstats", buf, size, d);
5100 }
5101
5102 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
5103 if (ret < 0) {
5104 if (ret == -EOPNOTSUPP)
5105 return read_file_fuse("/proc/diskstats", buf, size, d);
5106 }
5107
5108 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
5109 if (ret < 0) {
5110 if (ret == -EOPNOTSUPP)
5111 return read_file_fuse("/proc/diskstats", buf, size, d);
5112 }
5113
5114 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
5115 if (ret < 0) {
5116 if (ret == -EOPNOTSUPP)
5117 return read_file_fuse("/proc/diskstats", buf, size, d);
5118 }
5119
5120 f = fopen("/proc/diskstats", "r");
5121 if (!f)
5122 return 0;
5123
5124 while (getline(&line, &linelen, f) != -1) {
5125 ssize_t l;
5126 char lbuf[256];
5127
5128 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5129 if (i != 3)
5130 continue;
5131
5132 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5133 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5134 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5135 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5136 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5137 read_sectors = read_sectors/512;
5138 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5139 write_sectors = write_sectors/512;
5140
5141 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5142 rd_svctm = rd_svctm/1000000;
5143 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5144 rd_wait = rd_wait/1000000;
5145 read_ticks = rd_svctm + rd_wait;
5146
5147 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5148 wr_svctm = wr_svctm/1000000;
5149 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5150 wr_wait = wr_wait/1000000;
5151 write_ticks = wr_svctm + wr_wait;
5152
5153 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5154 tot_ticks = tot_ticks/1000000;
5155
5156 memset(lbuf, 0, 256);
5157 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5158 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5159 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5160 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5161 else
5162 continue;
5163
5164 l = snprintf(cache, cache_size, "%s", lbuf);
5165 if (l < 0) {
5166 perror("Error writing to fuse buf");
5167 return 0;
5168 }
5169 if (l >= cache_size) {
5170 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5171 return 0;
5172 }
5173 cache += l;
5174 cache_size -= l;
5175 total_len += l;
5176 }
5177
5178 d->cached = 1;
5179 d->size = total_len;
if (total_len > size) total_len = size;
5181 memcpy(buf, d->buf, total_len);
5182
5183 return total_len;
5184 }
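
/*
 * A rendered diskstats line (hypothetical device and counters) looks like:
 *
 *   8 0 sda 1420 0 180 12 95 0 32 8 0 20 0
 *
 * mirroring the host's field order but with the counters taken from the
 * container's blkio cgroup; the in-flight and weighted fields (ios_pgr,
 * rq_ticks) are always rendered as zero.
 */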
5185
5186 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5187 struct fuse_file_info *fi)
5188 {
5189 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5190 *memswusage_str = NULL;
5191 struct fuse_context *fc = fuse_get_context();
5192 struct file_info *d = (struct file_info *)fi->fh;
5193 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5194 memswusage = 0, swap_total = 0, swap_free = 0;
5195 ssize_t total_len = 0;
5196 ssize_t l = 0;
5197 char *cache = d->buf;
5198 int ret;
5199
5200 if (offset) {
5201 int left;
5202
5203 if (offset > d->size)
5204 return -EINVAL;
5205
5206 if (!d->cached)
5207 return 0;
5208
5209 left = d->size - offset;
5210 total_len = left > size ? size: left;
5211 memcpy(buf, cache + offset, total_len);
5212
5213 return total_len;
5214 }
5215
5216 pid_t initpid = lookup_initpid_in_store(fc->pid);
5217 if (initpid <= 1 || is_shared_pidns(initpid))
5218 initpid = fc->pid;
5219 cg = get_pid_cgroup(initpid, "memory");
5220 if (!cg)
5221 return read_file_fuse("/proc/swaps", buf, size, d);
5222 prune_init_slice(cg);
5223
5224 memlimit = get_min_memlimit(cg, false);
5225
5226 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5227 if (ret < 0)
5228 return 0;
5229
5230 memusage = strtoul(memusage_str, NULL, 10);
5231
5232 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5233 if (ret >= 0)
5234 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5235 if (ret >= 0) {
5236 memswlimit = get_min_memlimit(cg, true);
5237 memswusage = strtoul(memswusage_str, NULL, 10);
5238 swap_total = (memswlimit - memlimit) / 1024;
5239 swap_free = (memswusage - memusage) / 1024;
5240 }
5241
total_len = snprintf(d->buf, d->buflen, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");

/* When no mem + swap limit is specified or swapaccount=0 */
5245 if (!memswlimit) {
5246 __do_free char *line = NULL;
5247 __do_fclose FILE *f = NULL;
5248 size_t linelen = 0;
5249
5250 f = fopen("/proc/meminfo", "r");
5251 if (!f)
5252 return 0;
5253
5254 while (getline(&line, &linelen, f) != -1) {
5255 if (startswith(line, "SwapTotal:"))
5256 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5257 else if (startswith(line, "SwapFree:"))
5258 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5259 }
5260 }
5261
5262 if (swap_total > 0) {
l = snprintf(d->buf + total_len, d->buflen - total_len,
"none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
swap_total, swap_free);
5266 total_len += l;
5267 }
5268
5269 if (total_len < 0 || l < 0) {
5270 perror("Error writing to cache");
5271 return 0;
5272 }
5273
5274 d->cached = 1;
5275 d->size = (int)total_len;
5276
5277 if (total_len > size) total_len = size;
5278 memcpy(buf, d->buf, total_len);
5279 return total_len;
5280 }
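
/*
 * Example (hypothetical limits): with a 1 GiB memory limit, a 1.5 GiB
 * memory+swap limit and 200 MiB of swap in use, the container reads
 * something like:
 *
 *   Filename                                Type            Size    Used    Priority
 *   none                                    virtual         524288  204800  0
 */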
5281
/*
 * Find the process pids below a cgroup path,
 * e.g. by reading /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
 * @pid_buf : the collected pids are appended to pid_buf.
 * @dpath : the path of the cgroup, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : the depth of the cgroup within the container.
 * @sum : returns the number of pids collected.
 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu
 */
5291 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5292 {
5293 __do_free char *path = NULL;
5294 __do_close_prot_errno int fd = -EBADF;
5295 __do_fclose FILE *f = NULL;
5296 __do_closedir DIR *dir = NULL;
5297 struct dirent *file;
5298 size_t linelen = 0;
5299 char *line = NULL;
5300 int pd;
5301 char **pid;
5302
/* path = dpath + "/cgroup.procs" + '\0' */
5304 path = malloc(strlen(dpath) + 20);
5305 if (!path)
5306 return sum;
5307
5308 strcpy(path, dpath);
5309 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
5310 if (fd < 0)
5311 return sum;
5312
5313 dir = fdopendir(move_fd(fd));
5314 if (!dir)
5315 return sum;
5316
5317 while (((file = readdir(dir)) != NULL) && depth > 0) {
5318 if (strcmp(file->d_name, ".") == 0)
5319 continue;
5320
5321 if (strcmp(file->d_name, "..") == 0)
5322 continue;
5323
5324 if (file->d_type == DT_DIR) {
5325 __do_free char *path_dir = NULL;
5326
/* path + '/' + d_name + '\0' */
5328 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5329 if (!path_dir)
5330 return sum;
5331
5332 strcpy(path_dir, path);
5333 strcat(path_dir, "/");
5334 strcat(path_dir, file->d_name);
5335 pd = depth - 1;
5336 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5337 }
5338 }
5339
5340 strcat(path, "/cgroup.procs");
5341 fd = openat(cfd, path, O_RDONLY);
5342 if (fd < 0)
5343 return sum;
5344
5345 f = fdopen(move_fd(fd), "r");
5346 if (!f)
5347 return sum;
5348
5349 while (getline(&line, &linelen, f) != -1) {
5350 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5351 if (!pid)
5352 return sum;
5353 *pid_buf = pid;
5354
5355 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5356 if (!*(*pid_buf + sum))
5357 return sum;
5358
5359 strcpy(*(*pid_buf + sum), line);
5360 sum++;
5361 }
5362
5363 return sum;
5364 }
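
/*
 * Usage sketch: for a container cgroup "/docker/<id>" with one child cgroup,
 * calc_pid(&pid_buf, "/docker/<id>", DEPTH_DIR, 0, cfd) descends up to
 * DEPTH_DIR levels, appends every line of each cgroup.procs file it finds to
 * pid_buf, and returns the number of pids collected.
 */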
5365
/*
 * calc_load calculates the load according to the following formula:
 * load1 = load0 * exp + active * (1 - exp)
 *
 * @load1: the new loadavg.
 * @load0: the former loadavg.
 * @active: the total number of running pids at this moment.
 * @exp: the fixed-point constant defined at the top of this file.
 */
5375 static unsigned long
5376 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5377 {
5378 unsigned long newload;
5379
5380 active = active > 0 ? active * FIXED_1 : 0;
5381 newload = load * exp + active * (FIXED_1 - exp);
5382 if (active >= load)
5383 newload += FIXED_1 - 1;
5384
5385 return newload / FIXED_1;
5386 }
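
/*
 * Worked example: with FIXED_1 = 1 << 11 = 2048 and EXP_1 = 1884, a single
 * running pid (active = 2048) on top of a previous load of 0 gives
 *
 *   newload = 0 * 1884 + 2048 * (2048 - 1884) + (FIXED_1 - 1)
 *           = 335872 + 2047 = 337919, and 337919 / 2048 = 165,
 *
 * which LOAD_INT()/LOAD_FRAC() render as "0.08" after one 5-second tick.
 */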
5387
/*
 * Returns 0 when the container p->cg has been closed.
 * Returns -1 when an error occurred during the refresh.
 * A positive number is the total number of pids found.
 */
5393 static int refresh_load(struct load_node *p, char *path)
5394 {
5395 __do_free char *line = NULL;
5396 char **idbuf;
5397 char proc_path[256];
5398 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5399 size_t linelen = 0;
5400 int sum, length;
5401 struct dirent *file;
5402
5403 idbuf = malloc(sizeof(char *));
5404 if (!idbuf)
5405 return -1;
5406
5407 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5408 /* normal exit */
5409 if (sum == 0)
5410 goto out;
5411
5412 for (i = 0; i < sum; i++) {
5413 __do_closedir DIR *dp = NULL;
5414
/* strip the trailing '\n' */
5416 length = strlen(idbuf[i])-1;
5417 idbuf[i][length] = '\0';
5418 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5419 if (ret < 0 || ret > 255) {
5420 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5421 i = sum;
5422 sum = -1;
5423 goto err_out;
5424 }
5425
5426 dp = opendir(proc_path);
5427 if (!dp) {
5428 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5429 continue;
5430 }
5431 while ((file = readdir(dp)) != NULL) {
5432 __do_fclose FILE *f = NULL;
5433
5434 if (strncmp(file->d_name, ".", 1) == 0)
5435 continue;
5436 if (strncmp(file->d_name, "..", 1) == 0)
5437 continue;
5438 total_pid++;
/* We make the biggest pid become last_pid. */
ret = atoi(file->d_name);
5441 last_pid = (ret > last_pid) ? ret : last_pid;
5442
5443 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5444 if (ret < 0 || ret > 255) {
5445 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5446 i = sum;
5447 sum = -1;
5448 goto err_out;
5449 }
5450
5451 f = fopen(proc_path, "r");
5452 if (f != NULL) {
5453 while (getline(&line, &linelen, f) != -1) {
5454 /* Find State */
5455 if ((line[0] == 'S') && (line[1] == 't'))
5456 break;
5457 }
5458
5459 if ((line[7] == 'R') || (line[7] == 'D'))
5460 run_pid++;
5461 }
5462 }
5463 }
/* Calculate the loadavg. */
5465 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5466 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5467 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5468 p->run_pid = run_pid;
5469 p->total_pid = total_pid;
5470 p->last_pid = last_pid;
5471
5472 err_out:
5473 for (; i > 0; i--)
5474 free(idbuf[i-1]);
5475 out:
5476 free(idbuf);
5477 return sum;
5478 }
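
/*
 * The status parsing above relies on the fixed layout of the State line in
 * /proc/<pid>/task/<tid>/status, e.g.
 *
 *   State:\tR (running)
 *
 * so line[7] holds the one-letter state. 'R' (running) and 'D'
 * (uninterruptible sleep) are counted towards the run queue, matching the
 * kernel's own loadavg accounting.
 */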
5479
5480 /*
5481 * Traverse the hash table and update it.
5482 */
5483 void *load_begin(void *arg)
5484 {
int i, sum, length, ret;
5487 struct load_node *f;
5488 int first_node;
5489 clock_t time1, time2;
5490
5491 while (1) {
5492 if (loadavg_stop == 1)
5493 return NULL;
5494
5495 time1 = clock();
5496 for (i = 0; i < LOAD_SIZE; i++) {
5497 pthread_mutex_lock(&load_hash[i].lock);
5498 if (load_hash[i].next == NULL) {
5499 pthread_mutex_unlock(&load_hash[i].lock);
5500 continue;
5501 }
5502 f = load_hash[i].next;
5503 first_node = 1;
5504 while (f) {
5505 __do_free char *path = NULL;
5506
5507 length = strlen(f->cg) + 2;
5508 /* strlen(f->cg) + '.' or '' + \0 */
5509 path = malloc(length);
5510 if (!path)
5511 goto out;
5512
5513 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
5514 if (ret < 0 || ret > length - 1) {
5515 /* snprintf failed, ignore the node.*/
5516 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5517 goto out;
5518 }
5519
5520 sum = refresh_load(f, path);
5521 if (sum == 0)
5522 f = del_node(f, i);
5523 else
5524 out: f = f->next;
5525 /* load_hash[i].lock locks only on the first node.*/
5526 if (first_node == 1) {
5527 first_node = 0;
5528 pthread_mutex_unlock(&load_hash[i].lock);
5529 }
5530 }
5531 }
5532
5533 if (loadavg_stop == 1)
5534 return NULL;
5535
time2 = clock();
/* Clamp so that a refresh pass longer than FLUSH_TIME cannot underflow
 * into a near-infinite sleep. */
long sleep_time = FLUSH_TIME * 1000000L - (long)((time2 - time1) * 1000000 / CLOCKS_PER_SEC);
if (sleep_time > 0)
usleep(sleep_time);
5538 }
5539 }
5540
5541 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5542 struct fuse_file_info *fi)
5543 {
5544 struct fuse_context *fc = fuse_get_context();
5545 struct file_info *d = (struct file_info *)fi->fh;
5546 pid_t initpid;
5547 char *cg;
5548 size_t total_len = 0;
5549 char *cache = d->buf;
5550 struct load_node *n;
5551 int hash;
5552 int cfd, rv = 0;
5553 unsigned long a, b, c;
5554
5555 if (offset) {
5556 if (offset > d->size)
5557 return -EINVAL;
5558 if (!d->cached)
5559 return 0;
5560 int left = d->size - offset;
5561 total_len = left > size ? size : left;
5562 memcpy(buf, cache + offset, total_len);
5563 return total_len;
5564 }
5565 if (!loadavg)
5566 return read_file_fuse("/proc/loadavg", buf, size, d);
5567
5568 initpid = lookup_initpid_in_store(fc->pid);
5569 if (initpid <= 1 || is_shared_pidns(initpid))
5570 initpid = fc->pid;
5571 cg = get_pid_cgroup(initpid, "cpu");
5572 if (!cg)
5573 return read_file_fuse("/proc/loadavg", buf, size, d);
5574
5575 prune_init_slice(cg);
5576 hash = calc_hash(cg) % LOAD_SIZE;
5577 n = locate_node(cg, hash);
5578
5579 /* First time */
5580 if (n == NULL) {
5581 cfd = find_mounted_controller("cpu");
if (cfd < 0) {
5583 /*
5584 * In locate_node() above, pthread_rwlock_unlock() isn't used
5585 * because delete is not allowed before read has ended.
5586 */
5587 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5588 rv = 0;
5589 goto err;
5590 }
5591 do {
5592 n = malloc(sizeof(struct load_node));
5593 } while (!n);
5594
5595 do {
5596 n->cg = malloc(strlen(cg)+1);
5597 } while (!n->cg);
5598 strcpy(n->cg, cg);
5599 n->avenrun[0] = 0;
5600 n->avenrun[1] = 0;
5601 n->avenrun[2] = 0;
5602 n->run_pid = 0;
5603 n->total_pid = 1;
5604 n->last_pid = initpid;
5605 n->cfd = cfd;
5606 insert_node(&n, hash);
5607 }
5608 a = n->avenrun[0] + (FIXED_1/200);
5609 b = n->avenrun[1] + (FIXED_1/200);
5610 c = n->avenrun[2] + (FIXED_1/200);
5611 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5612 LOAD_INT(a), LOAD_FRAC(a),
5613 LOAD_INT(b), LOAD_FRAC(b),
5614 LOAD_INT(c), LOAD_FRAC(c),
5615 n->run_pid, n->total_pid, n->last_pid);
5616 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5617 if (total_len < 0 || total_len >= d->buflen) {
5618 lxcfs_error("%s\n", "Failed to write to cache");
5619 rv = 0;
5620 goto err;
5621 }
5622 d->size = (int)total_len;
5623 d->cached = 1;
5624
5625 if (total_len > size)
5626 total_len = size;
5627 memcpy(buf, d->buf, total_len);
5628 rv = total_len;
5629
5630 err:
5631 free(cg);
5632 return rv;
5633 }
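
/*
 * The rendered line matches the host's format, e.g. (hypothetical values):
 *
 *   0.08 0.03 0.01 1/112 4567
 *
 * where 1/112 is run_pid/total_pid for the container and 4567 its last_pid.
 */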
/* Return a positive number on success, return 0 on failure. */
5635 pthread_t load_daemon(int load_use)
5636 {
5637 int ret;
5638 pthread_t pid;
5639
5640 ret = init_load();
5641 if (ret == -1) {
lxcfs_error("%s\n", "Initializing the hash table failed in load_daemon!");
5643 return 0;
5644 }
5645 ret = pthread_create(&pid, NULL, load_begin, NULL);
5646 if (ret != 0) {
lxcfs_error("%s\n", "Creating the load thread failed in load_daemon!");
5648 load_free();
5649 return 0;
5650 }
/* use loadavg; here loadavg = 1 */
5652 loadavg = load_use;
5653 return pid;
5654 }
5655
5656 /* Returns 0 on success. */
5657 int stop_load_daemon(pthread_t pid)
5658 {
5659 int s;
5660
5661 /* Signal the thread to gracefully stop */
5662 loadavg_stop = 1;
5663
s = pthread_join(pid, NULL); /* Make sure the load thread has exited. */
5665 if (s != 0) {
5666 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5667 return -1;
5668 }
5669
5670 load_free();
5671 loadavg_stop = 0;
5672
5673 return 0;
5674 }
5675
5676 static off_t get_procfile_size(const char *which)
5677 {
5678 FILE *f = fopen(which, "r");
5679 char *line = NULL;
5680 size_t len = 0;
5681 ssize_t sz, answer = 0;
5682 if (!f)
5683 return 0;
5684
5685 while ((sz = getline(&line, &len, f)) != -1)
5686 answer += sz;
fclose(f);
5688 free(line);
5689
5690 return answer;
5691 }
5692
5693 int proc_getattr(const char *path, struct stat *sb)
5694 {
5695 struct timespec now;
5696
5697 memset(sb, 0, sizeof(struct stat));
5698 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5699 return -EINVAL;
5700 sb->st_uid = sb->st_gid = 0;
5701 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5702 if (strcmp(path, "/proc") == 0) {
5703 sb->st_mode = S_IFDIR | 00555;
5704 sb->st_nlink = 2;
5705 return 0;
5706 }
5707 if (strcmp(path, "/proc/meminfo") == 0 ||
5708 strcmp(path, "/proc/cpuinfo") == 0 ||
5709 strcmp(path, "/proc/uptime") == 0 ||
5710 strcmp(path, "/proc/stat") == 0 ||
5711 strcmp(path, "/proc/diskstats") == 0 ||
5712 strcmp(path, "/proc/swaps") == 0 ||
5713 strcmp(path, "/proc/loadavg") == 0) {
5714 sb->st_size = 0;
5715 sb->st_mode = S_IFREG | 00444;
5716 sb->st_nlink = 1;
5717 return 0;
5718 }
5719
5720 return -ENOENT;
5721 }
5722
5723 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5724 struct fuse_file_info *fi)
5725 {
5726 if (filler(buf, ".", NULL, 0) != 0 ||
5727 filler(buf, "..", NULL, 0) != 0 ||
5728 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5729 filler(buf, "meminfo", NULL, 0) != 0 ||
5730 filler(buf, "stat", NULL, 0) != 0 ||
5731 filler(buf, "uptime", NULL, 0) != 0 ||
5732 filler(buf, "diskstats", NULL, 0) != 0 ||
5733 filler(buf, "swaps", NULL, 0) != 0 ||
5734 filler(buf, "loadavg", NULL, 0) != 0)
5735 return -EINVAL;
5736 return 0;
5737 }
5738
5739 int proc_open(const char *path, struct fuse_file_info *fi)
5740 {
5741 int type = -1;
5742 struct file_info *info;
5743
5744 if (strcmp(path, "/proc/meminfo") == 0)
5745 type = LXC_TYPE_PROC_MEMINFO;
5746 else if (strcmp(path, "/proc/cpuinfo") == 0)
5747 type = LXC_TYPE_PROC_CPUINFO;
5748 else if (strcmp(path, "/proc/uptime") == 0)
5749 type = LXC_TYPE_PROC_UPTIME;
5750 else if (strcmp(path, "/proc/stat") == 0)
5751 type = LXC_TYPE_PROC_STAT;
5752 else if (strcmp(path, "/proc/diskstats") == 0)
5753 type = LXC_TYPE_PROC_DISKSTATS;
5754 else if (strcmp(path, "/proc/swaps") == 0)
5755 type = LXC_TYPE_PROC_SWAPS;
5756 else if (strcmp(path, "/proc/loadavg") == 0)
5757 type = LXC_TYPE_PROC_LOADAVG;
5758 if (type == -1)
5759 return -ENOENT;
5760
5761 info = malloc(sizeof(*info));
5762 if (!info)
5763 return -ENOMEM;
5764
5765 memset(info, 0, sizeof(*info));
5766 info->type = type;
5767
5768 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5769 do {
5770 info->buf = malloc(info->buflen);
5771 } while (!info->buf);
5772 memset(info->buf, 0, info->buflen);
5773 /* set actual size to buffer size */
5774 info->size = info->buflen;
5775
5776 fi->fh = (unsigned long)info;
5777 return 0;
5778 }
5779
5780 int proc_access(const char *path, int mask)
5781 {
5782 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5783 return 0;
5784
5785 /* these are all read-only */
5786 if ((mask & ~R_OK) != 0)
5787 return -EACCES;
5788 return 0;
5789 }
5790
5791 int proc_release(const char *path, struct fuse_file_info *fi)
5792 {
5793 do_release_file_info(fi);
5794 return 0;
5795 }
5796
5797 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5798 struct fuse_file_info *fi)
5799 {
5800 struct file_info *f = (struct file_info *) fi->fh;
5801
5802 switch (f->type) {
5803 case LXC_TYPE_PROC_MEMINFO:
5804 return proc_meminfo_read(buf, size, offset, fi);
5805 case LXC_TYPE_PROC_CPUINFO:
5806 return proc_cpuinfo_read(buf, size, offset, fi);
5807 case LXC_TYPE_PROC_UPTIME:
5808 return proc_uptime_read(buf, size, offset, fi);
5809 case LXC_TYPE_PROC_STAT:
5810 return proc_stat_read(buf, size, offset, fi);
5811 case LXC_TYPE_PROC_DISKSTATS:
5812 return proc_diskstats_read(buf, size, offset, fi);
5813 case LXC_TYPE_PROC_SWAPS:
5814 return proc_swaps_read(buf, size, offset, fi);
5815 case LXC_TYPE_PROC_LOADAVG:
5816 return proc_loadavg_read(buf, size, offset, fi);
5817 default:
5818 return -EINVAL;
5819 }
5820 }
5821
5822 /*
5823 * Functions needed to setup cgroups in the __constructor__.
5824 */
5825
5826 static bool umount_if_mounted(void)
5827 {
5828 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5829 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5830 return false;
5831 }
5832 return true;
5833 }
5834
5835 /* __typeof__ should be safe to use with all compilers. */
5836 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5837 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5838 {
5839 return (fs->f_type == (fs_type_magic)magic_val);
5840 }
5841
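/*
 * Minimal usage sketch:
 *
 *   struct statfs sb;
 *   if (statfs("/", &sb) == 0 && has_fs_type(&sb, RAMFS_MAGIC))
 *           ... we are on a ramfs ...
 *
 * The fs_type_magic cast papers over f_type having a different integer type
 * across libcs and architectures.
 */
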
5842 /*
5843 * looking at fs/proc_namespace.c, it appears we can
5844 * actually expect the rootfs entry to very specifically contain
5845 * " - rootfs rootfs "
5846 * IIUC, so long as we've chrooted so that rootfs is not our root,
5847 * the rootfs entry should always be skipped in mountinfo contents.
5848 */
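/*
 * For reference, a matching mountinfo entry would look roughly like this
 * (hypothetical ids):
 *
 *   1 1 0:1 / / rw - rootfs rootfs rw
 *
 * The parser below skips four space-separated fields, checks that the fifth
 * (the mount point) is "/", then looks for "- rootfs rootfs " after the
 * separator.
 */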
5849 static bool is_on_ramfs(void)
5850 {
5851 FILE *f;
5852 char *p, *p2;
5853 char *line = NULL;
5854 size_t len = 0;
5855 int i;
5856
5857 f = fopen("/proc/self/mountinfo", "r");
5858 if (!f)
5859 return false;
5860
5861 while (getline(&line, &len, f) != -1) {
5862 for (p = line, i = 0; p && i < 4; i++)
5863 p = strchr(p + 1, ' ');
5864 if (!p)
5865 continue;
5866 p2 = strchr(p + 1, ' ');
5867 if (!p2)
5868 continue;
5869 *p2 = '\0';
5870 if (strcmp(p + 1, "/") == 0) {
5871 // this is '/'. is it the ramfs?
5872 p = strchr(p2 + 1, '-');
5873 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5874 free(line);
5875 fclose(f);
5876 return true;
5877 }
5878 }
5879 }
5880 free(line);
5881 fclose(f);
5882 return false;
5883 }
5884
5885 static int pivot_enter()
5886 {
5887 int ret = -1, oldroot = -1, newroot = -1;
5888
5889 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5890 if (oldroot < 0) {
5891 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5892 return ret;
5893 }
5894
5895 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5896 if (newroot < 0) {
5897 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5898 goto err;
5899 }
5900
5901 /* change into new root fs */
5902 if (fchdir(newroot) < 0) {
5903 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5904 goto err;
5905 }
5906
5907 /* pivot_root into our new root fs */
5908 if (pivot_root(".", ".") < 0) {
5909 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5910 goto err;
5911 }
5912
/*
 * At this point the old root is mounted on top of our new root. To
 * unmount it we must not be chdir'd into it, so escape back to the
 * old root.
 */
5918 if (fchdir(oldroot) < 0) {
5919 lxcfs_error("%s\n", "Failed to enter old root.");
5920 goto err;
5921 }
5922
5923 if (umount2(".", MNT_DETACH) < 0) {
5924 lxcfs_error("%s\n", "Failed to detach old root.");
5925 goto err;
5926 }
5927
5928 if (fchdir(newroot) < 0) {
5929 lxcfs_error("%s\n", "Failed to re-enter new root.");
5930 goto err;
5931 }
5932
5933 ret = 0;
5934
5935 err:
if (oldroot >= 0)
close(oldroot);
if (newroot >= 0)
close(newroot);
5940
5941 return ret;
5942 }
5943
5944 static int chroot_enter()
5945 {
5946 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5948 return -1;
5949 }
5950
5951 if (chroot(".") < 0) {
5952 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5953 return -1;
5954 }
5955
5956 if (chdir("/") < 0) {
5957 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5958 return -1;
5959 }
5960
5961 return 0;
5962 }
5963
5964 static int permute_and_enter(void)
5965 {
5966 struct statfs sb;
5967
5968 if (statfs("/", &sb) < 0) {
5969 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5970 return -1;
5971 }
5972
/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 * likely report TMPFS_MAGIC. Hence, when it reports no match we still
 * check /proc/self/mountinfo. */
5976 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5977 return chroot_enter();
5978
5979 if (pivot_enter() < 0) {
5980 lxcfs_error("%s\n", "Could not perform pivot root.");
5981 return -1;
5982 }
5983
5984 return 0;
5985 }
5986
5987 /* Prepare our new clean root. */
5988 static int permute_prepare(void)
5989 {
5990 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5991 lxcfs_error("%s\n", "Failed to create directory for new root.");
5992 return -1;
5993 }
5994
5995 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5996 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5997 return -1;
5998 }
5999
6000 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6001 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6002 return -1;
6003 }
6004
6005 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6007 return -1;
6008 }
6009
6010 return 0;
6011 }
6012
6013 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6014 static bool permute_root(void)
6015 {
6016 /* Prepare new root. */
6017 if (permute_prepare() < 0)
6018 return false;
6019
6020 /* Pivot into new root. */
6021 if (permute_and_enter() < 0)
6022 return false;
6023
6024 return true;
6025 }
6026
6027 static int preserve_mnt_ns(int pid)
6028 {
6029 int ret;
6030 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6031 char path[len];
6032
6033 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6034 if (ret < 0 || (size_t)ret >= len)
6035 return -1;
6036
6037 return open(path, O_RDONLY | O_CLOEXEC);
6038 }
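
/*
 * For example, preserve_mnt_ns(getpid()) opens "/proc/<pid>/ns/mnt" and
 * returns an fd that a later setns(fd, 0) can use to re-enter this mount
 * namespace, which is exactly how lxcfs_init() restores the initial
 * namespace below.
 */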
6039
6040 static bool cgfs_prepare_mounts(void)
6041 {
6042 if (!mkdir_p(BASEDIR, 0700)) {
6043 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6044 return false;
6045 }
6046
6047 if (!umount_if_mounted()) {
6048 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6049 return false;
6050 }
6051
6052 if (unshare(CLONE_NEWNS) < 0) {
6053 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6054 return false;
6055 }
6056
6057 cgroup_ops->mntns_fd = preserve_mnt_ns(getpid());
6058 if (cgroup_ops->mntns_fd < 0) {
6059 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6060 return false;
6061 }
6062
6063 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6064 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6065 return false;
6066 }
6067
6068 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6069 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6070 return false;
6071 }
6072
6073 return true;
6074 }
6075
6076 static bool cgfs_mount_hierarchies(void)
6077 {
6078 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
6079 return false;
6080
6081 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
6082 return false;
6083
6084 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
6085 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
6086 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
6087 if ((*h)->fd < 0)
6088 return false;
6089 }
6090
6091 return true;
6092 }
6093
6094 static bool cgfs_setup_controllers(void)
6095 {
6096 if (!cgfs_prepare_mounts())
6097 return false;
6098
6099 if (!cgfs_mount_hierarchies()) {
6100 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6101 return false;
6102 }
6103
6104 if (!permute_root())
6105 return false;
6106
6107 return true;
6108 }
6109
6110 static void __attribute__((constructor)) lxcfs_init(void)
6111 {
6112 __do_close_prot_errno int init_ns = -EBADF;
6113 char *cret;
6114 char cwd[MAXPATHLEN];
6115
6116 cgroup_ops = cgroup_init();
6117 if (!cgroup_ops)
6118 log_exit("Failed to initialize cgroup support");
6119
6120 /* Preserve initial namespace. */
6121 init_ns = preserve_mnt_ns(getpid());
6122 if (init_ns < 0)
6123 log_exit("Failed to preserve initial mount namespace");
6124
cret = getcwd(cwd, MAXPATHLEN);
if (!cret)
log_exit("%s - Could not retrieve current working directory", strerror(errno));
6127
/* This function calls unshare(CLONE_NEWNS), leaving our initial mount
 * namespace, in order to privately mount the lxcfs cgroups. */
6130 if (!cgfs_setup_controllers())
6131 log_exit("Failed to setup private cgroup mounts for lxcfs");
6132
6133 if (setns(init_ns, 0) < 0)
6134 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
6135
6136 if (!cret || chdir(cwd) < 0)
6137 log_exit("%s - Could not change back to original working directory", strerror(errno));
6138
6139 if (!init_cpuview())
6140 log_exit("Failed to init CPU view");
6141
6142 print_subsystems();
6143 }
6144
6145 static void __attribute__((destructor)) lxcfs_exit(void)
6146 {
6147 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
6148 free_cpuview();
6149 cgroup_exit(cgroup_ops);
6150 }