]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
bindings: cgfs_set_value(), write_string()
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
11#include <stdio.h>
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
22#include <sched.h>
23#include <pthread.h>
24#include <linux/sched.h>
25#include <sys/param.h>
26#include <sys/socket.h>
27#include <sys/mount.h>
28#include <sys/epoll.h>
29#include <wait.h>
30
237e200e
SH
31#include "bindings.h"
32
33#include "config.h" // for VERSION
34
/*
 * Type tags for the kinds of entries handled in this file: cgroup
 * directories and files plus (judging by the names) several /proc files.
 * The numeric values follow declaration order starting at 0.
 */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
};
45
/* Per-file bookkeeping record. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* presumably one of the LXC_TYPE_* tags -- TODO confirm */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
56
57/* reserve buffer size, for cpuall in /proc/stat */
58#define BUF_RESERVE_SIZE 256
59
60/*
61 * A table caching which pid is init for a pid namespace.
62 * When looking up which pid is init for $qpid, we first
63 * 1. Stat /proc/$qpid/ns/pid.
64 * 2. Check whether the ino_t is in our store.
65 * a. if not, fork a child in qpid's ns to send us
66 * ucred.pid = 1, and read the initpid. Cache
67 * initpid and creation time for /proc/initpid
68 * in a new store entry.
69 * b. if so, verify that /proc/initpid still matches
70 * what we have saved. If not, clear the store
71 * entry and go back to a. If so, return the
72 * cached initpid.
73 */
/* One cached "pid namespace -> init pid" mapping; entries chain per hash bucket. */
struct pidns_init_store {
	ino_t ino;			/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	long int lastcheck;		/* time of the last successful lookup; used for pruning */
};
81
82/* lol - look at how they are allocated in the kernel */
83#define PIDNS_HASH_SIZE 4096
84#define HASH(x) ((x) % PIDNS_HASH_SIZE)
85
86static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
87static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l.  If pthread_mutex_lock() reports an error, print it to stderr
 * and terminate the process with exit status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
97
/*
 * Release @l.  If pthread_mutex_unlock() reports an error, print it to
 * stderr and terminate the process with exit status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
107
108static void store_lock(void)
109{
110 lock_mutex(&pidns_store_mutex);
111}
112
113static void store_unlock(void)
114{
115 unlock_mutex(&pidns_store_mutex);
116}
117
118/* Must be called under store_lock */
119static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
120{
121 struct stat initsb;
122 char fnam[100];
123
124 snprintf(fnam, 100, "/proc/%d", e->initpid);
125 if (stat(fnam, &initsb) < 0)
126 return false;
127#if DEBUG
128 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
129 e->ctime, initsb.st_ctime, e->initpid);
130#endif
131 if (e->ctime != initsb.st_ctime)
132 return false;
133 return true;
134}
135
136/* Must be called under store_lock */
137static void remove_initpid(struct pidns_init_store *e)
138{
139 struct pidns_init_store *tmp;
140 int h;
141
142#if DEBUG
143 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
144#endif
145 h = HASH(e->ino);
146 if (pidns_hash_table[h] == e) {
147 pidns_hash_table[h] = e->next;
148 free(e);
149 return;
150 }
151
152 tmp = pidns_hash_table[h];
153 while (tmp) {
154 if (tmp->next == e) {
155 tmp->next = e->next;
156 free(e);
157 return;
158 }
159 tmp = tmp->next;
160 }
161}
162
163#define PURGE_SECS 5
164/* Must be called under store_lock */
165static void prune_initpid_store(void)
166{
167 static long int last_prune = 0;
168 struct pidns_init_store *e, *prev, *delme;
169 long int now, threshold;
170 int i;
171
172 if (!last_prune) {
173 last_prune = time(NULL);
174 return;
175 }
176 now = time(NULL);
177 if (now < last_prune + PURGE_SECS)
178 return;
179#if DEBUG
180 fprintf(stderr, "pruning\n");
181#endif
182 last_prune = now;
183 threshold = now - 2 * PURGE_SECS;
184
185 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
186 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
187 if (e->lastcheck < threshold) {
188#if DEBUG
189 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
190#endif
191 delme = e;
192 if (prev)
193 prev->next = e->next;
194 else
195 pidns_hash_table[i] = e->next;
196 e = e->next;
197 free(delme);
198 } else {
199 prev = e;
200 e = e->next;
201 }
202 }
203 }
204}
205
206/* Must be called under store_lock */
207static void save_initpid(struct stat *sb, pid_t pid)
208{
209 struct pidns_init_store *e;
210 char fpath[100];
211 struct stat procsb;
212 int h;
213
214#if DEBUG
215 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
216#endif
217 snprintf(fpath, 100, "/proc/%d", pid);
218 if (stat(fpath, &procsb) < 0)
219 return;
220 do {
221 e = malloc(sizeof(*e));
222 } while (!e);
223 e->ino = sb->st_ino;
224 e->initpid = pid;
225 e->ctime = procsb.st_ctime;
226 h = HASH(e->ino);
227 e->next = pidns_hash_table[h];
228 e->lastcheck = time(NULL);
229 pidns_hash_table[h] = e;
230}
231
232/*
233 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
234 * entry for the inode number and creation time. Verify that the init pid
235 * is still valid. If not, remove it. Return the entry if valid, NULL
236 * otherwise.
237 * Must be called under store_lock
238 */
239static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
240{
241 int h = HASH(sb->st_ino);
242 struct pidns_init_store *e = pidns_hash_table[h];
243
244 while (e) {
245 if (e->ino == sb->st_ino) {
246 if (initpid_still_valid(e, sb)) {
247 e->lastcheck = time(NULL);
248 return e;
249 }
250 remove_initpid(e);
251 return NULL;
252 }
253 e = e->next;
254 }
255
256 return NULL;
257}
258
/* Return 1 if @path can be stat()ed and is a directory, 0 otherwise. */
static int is_dir(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;
	return S_ISDIR(st.st_mode) ? 1 : 0;
}
267
/*
 * Return a heap copy of @str, retrying the allocation until it succeeds.
 * A NULL @str yields NULL.  The caller owns (and must free) the result.
 */
static char *must_copy_string(const char *str)
{
	char *copy = NULL;
	size_t bytes;

	if (!str)
		return NULL;

	bytes = strlen(str) + 1;	/* include the terminating \0 */
	while (!copy)
		copy = malloc(bytes);
	memcpy(copy, str, bytes);

	return copy;
}
279
/*
 * Strip every trailing '\n' from @s in place.
 * The length is tracked as size_t so strings longer than INT_MAX are
 * handled correctly.
 */
static inline void drop_trailing_newlines(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && s[len - 1] == '\n')
		s[--len] = '\0';
}
287
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes, growing in multiples of
 * BATCH_SIZE.  @oldlen is the length the buffer was last sized for; the
 * buffer is only reallocated when *mem is NULL or @newlen falls into a
 * later batch than @oldlen.  Allocation is retried until it succeeds.
 *
 * Batch counts are kept in size_t so large lengths are not truncated.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;
		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}
/*
 * Append @line (@linelen characters plus its terminating \0) to
 * *contents, growing the buffer as needed, and advance *len by @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t used = *len;
	size_t total = used + linelen;

	dorealloc(contents, used, total + 1);
	/* copy linelen + 1 bytes so the result stays NUL-terminated */
	memcpy(*contents + used, line, linelen + 1);
	*len = total;
}
309
/*
 * Read everything available from @fd and return it as one heap string
 * with trailing newlines removed.  Returns NULL if the stream cannot be
 * set up or nothing was read.  The caller frees the result.
 *
 * Takes ownership of @fd: it is closed on every path, including when
 * fdopen() fails (previously the descriptor leaked in that case).
 * @from is not used by this function.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* fdopen() does not close the descriptor on failure */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
331
/*
 * Write @string to @fd through a stdio stream.  Returns true only if the
 * whole string was written and the stream closed without error.
 *
 * Takes ownership of @fd: it is closed on every path, including when
 * fdopen() fails (previously the descriptor leaked in that case).
 * @fnam is not used by this function.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		/* fdopen() does not close the descriptor on failure */
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* fclose() flushes, so a failure here also means the write failed */
	if (fclose(f) < 0) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
352
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;	/* heap string, released by free_key() */
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
358
0619767c 359#define ALLOC_NUM 20
237e200e
SH
360static bool store_hierarchy(char *stridx, char *h)
361{
0619767c
SH
362 if (num_hierarchies % ALLOC_NUM == 0) {
363 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
364 n *= ALLOC_NUM;
365 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c
SH
366 if (!tmp) {
367 fprintf(stderr, "Out of memory\n");
368 exit(1);
369 }
237e200e 370 hierarchies = tmp;
237e200e 371 }
f676eb79 372
0619767c 373 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
374 return true;
375}
376
377static void print_subsystems(void)
378{
379 int i;
380
cc97d34c 381 fprintf(stderr, "hierarchies:\n");
237e200e
SH
382 for (i = 0; i < num_hierarchies; i++) {
383 if (hierarchies[i])
384 fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
385 }
386}
387
/*
 * Return true if @needle equals one of the comma-separated items in
 * @haystack (exact, case-sensitive match of a whole item).
 *
 * Uses strchr() instead of the legacy index(), and compares lengths as
 * size_t to avoid a signed/unsigned comparison.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *s = haystack, *e;
	size_t nlen = strlen(needle);

	while (*s && (e = strchr(s, ',')) != NULL) {
		if (nlen == (size_t)(e - s) && strncmp(needle, s, nlen) == 0)
			return true;
		s = e + 1;
	}
	/* s now points at the last item (or an empty string) */
	return strcmp(needle, s) == 0;
}
406
407/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
408/* Return the mounted controller and store the corresponding open file descriptor
409 * referring to the controller mountpoint in the private lxcfs namespace in
410 * @cfd.
411 */
412static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
413{
414 int i;
415
416 for (i = 0; i < num_hierarchies; i++) {
417 if (!hierarchies[i])
418 continue;
5dd3e6fd
CB
419 if (strcmp(hierarchies[i], controller) == 0) {
420 *cfd = fd_hierarchies[i];
237e200e 421 return hierarchies[i];
5dd3e6fd
CB
422 }
423 if (in_comma_list(controller, hierarchies[i])) {
424 *cfd = fd_hierarchies[i];
237e200e 425 return hierarchies[i];
5dd3e6fd 426 }
237e200e
SH
427 }
428
429 return NULL;
430}
431
/*
 * Write @value to @file inside @cgroup of @controller.  The file is opened
 * relative to the controller's descriptor.  Returns false if the
 * controller is unknown, the path cannot be built, the open fails or the
 * write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int rc, fd, cfd;
	size_t pathlen;
	char *path;
	char *mount = find_mounted_controller(controller, &cfd);

	if (!mount)
		return false;

	/* . + /cgroup + / + file + \0 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	/* a leading "." keeps an absolute cgroup path relative for openat() */
	rc = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (rc < 0 || (size_t)rc >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
454
// Chown all the files in the cgroup directory.  We do this when we create
// a cgroup on behalf of a user.
//
// Every entry of @dirname except "." and ".." is chowned to @uid:@gid.
// Failures are reported on stderr and do not stop the walk.
//
// Uses readdir() rather than the deprecated readdir_r(), and terminates
// the chown failure message with a newline.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
		return;
	}

	d = opendir(dirname);
	if (!d) {
		fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
		return;
	}

	while ((direntp = readdir(d)) != NULL) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
			continue;
		}
		if (chown(path, uid, gid) < 0)
			fprintf(stderr, "Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	closedir(d);
}
490
491int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
492{
5dd3e6fd 493 int cfd;
237e200e 494 size_t len;
5dd3e6fd 495 char *dirnam, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
496
497 if (!tmpc)
498 return -EINVAL;
cc97d34c
CB
499 /* BASEDIR / tmpc / cg \0 */
500 len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3;
237e200e 501 dirnam = alloca(len);
cc97d34c 502 snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg);
237e200e
SH
503
504 if (mkdir(dirnam, 0755) < 0)
505 return -errno;
506
507 if (uid == 0 && gid == 0)
508 return 0;
509
510 if (chown(dirnam, uid, gid) < 0)
511 return -errno;
512
513 chown_all_cgroup_files(dirnam, uid, gid);
514
515 return 0;
516}
517
/*
 * Remove @dirname and all directories below it.  Sub-directories are
 * removed recursively; non-directory entries are left to rmdir() (which
 * fails if any remain).  Returns true only if the directory could be
 * closed and removed.
 *
 * Uses readdir() rather than the deprecated readdir_r(), and keeps the
 * lstat() result in an int instead of squeezing it through the bool
 * return variable.
 */
static bool recursive_rmdir(const char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
#if DEBUG
		fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno));
#endif
		return false;
	}

	while ((direntp = readdir(dir)) != NULL) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			fprintf(stderr, "pathname too long\n");
			continue;
		}

		rc = lstat(pathname, &mystat);
		if (rc) {
#if DEBUG
			fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
#endif
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			/* best effort: a failure here surfaces through rmdir() below */
			if (!recursive_rmdir(pathname)) {
#if DEBUG
				fprintf(stderr, "Error removing %s\n", pathname);
#endif
			}
		}
	}

	ret = true;
	if (closedir(dir) < 0) {
		fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
		ret = false;
	}

	if (rmdir(dirname) < 0) {
#if DEBUG
		fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno));
#endif
		ret = false;
	}

	return ret;
}
581
582bool cgfs_remove(const char *controller, const char *cg)
583{
5dd3e6fd 584 int cfd;
237e200e 585 size_t len;
5dd3e6fd 586 char *dirnam, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
587
588 if (!tmpc)
589 return false;
cc97d34c
CB
590 /* BASEDIR / tmpc / cg \0 */
591 len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3;
237e200e 592 dirnam = alloca(len);
cc97d34c 593 snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg);
237e200e
SH
594 return recursive_rmdir(dirnam);
595}
596
597bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
598{
5dd3e6fd 599 int cfd;
237e200e 600 size_t len;
5dd3e6fd 601 char *pathname, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
602
603 if (!tmpc)
604 return false;
cc97d34c
CB
605 /* BASEDIR / tmpc / file \0 */
606 len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3;
237e200e 607 pathname = alloca(len);
cc97d34c 608 snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file);
237e200e
SH
609 if (chmod(pathname, mode) < 0)
610 return false;
611 return true;
612}
613
/*
 * chown() the "tasks" and "cgroup.procs" files inside @dirname to
 * @uid:@gid.  Returns 0 on success or -errno of the first failing chown().
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid)
{
	/* sized for the longer of the two names, so it fits both */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(buflen);

	snprintf(path, buflen, "%s/tasks", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	snprintf(path, buflen, "%s/cgroup.procs", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	return 0;
}
629
630int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
631{
5dd3e6fd 632 int cfd;
237e200e 633 size_t len;
5dd3e6fd 634 char *pathname, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
635
636 if (!tmpc)
637 return -EINVAL;
cc97d34c
CB
638 /* BASEDIR / tmpc / file \0 */
639 len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3;
237e200e 640 pathname = alloca(len);
cc97d34c 641 snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file);
237e200e
SH
642 if (chown(pathname, uid, gid) < 0)
643 return -errno;
644
645 if (is_dir(pathname))
646 // like cgmanager did, we want to chown the tasks file as well
647 return chown_tasks_files(pathname, uid, gid);
648
649 return 0;
650}
651
652FILE *open_pids_file(const char *controller, const char *cgroup)
653{
5dd3e6fd 654 int cfd;
237e200e 655 size_t len;
5dd3e6fd 656 char *pathname, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
657
658 if (!tmpc)
659 return NULL;
cc97d34c
CB
660 /* BASEDIR / tmpc / cgroup / "cgroup.procs" \0 */
661 len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
237e200e 662 pathname = alloca(len);
cc97d34c 663 snprintf(pathname, len, "%s/%s/%s/cgroup.procs", BASEDIR, tmpc, cgroup);
237e200e
SH
664 return fopen(pathname, "w");
665}
666
f366da65
WB
667static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
668 void ***list, size_t typesize,
669 void* (*iterator)(const char*, const char*, const char*))
237e200e 670{
4ea38a4c 671 int cfd, fd, ret;
237e200e 672 size_t len;
4ea38a4c 673 char *cg, *tmpc;
237e200e 674 char pathname[MAXPATHLEN];
f366da65 675 size_t sz = 0, asz = 0;
4ea38a4c 676 struct dirent *dirent;
237e200e 677 DIR *dir;
237e200e 678
4ea38a4c 679 tmpc = find_mounted_controller(controller, &cfd);
f366da65 680 *list = NULL;
237e200e 681 if (!tmpc)
e97c834b 682 return false;
237e200e 683
4ea38a4c
CB
684 /* Make sure we pass a relative path to openat(). */
685 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
686 cg = alloca(len);
687 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
688 if (ret < 0 || (size_t)ret >= len) {
689 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup);
690 return false;
691 }
237e200e 692
4ea38a4c
CB
693 fd = openat(cfd, cg, O_DIRECTORY);
694 if (fd < 0)
695 return false;
696
697 dir = fdopendir(fd);
237e200e
SH
698 if (!dir)
699 return false;
700
4ea38a4c 701 while ((dirent = readdir(dir))) {
237e200e 702 struct stat mystat;
237e200e 703
4ea38a4c
CB
704 if (!strcmp(dirent->d_name, ".") ||
705 !strcmp(dirent->d_name, ".."))
237e200e
SH
706 continue;
707
4ea38a4c
CB
708 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
709 if (ret < 0 || ret >= MAXPATHLEN) {
710 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg);
237e200e
SH
711 continue;
712 }
713
4ea38a4c 714 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e
SH
715 if (ret) {
716 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
717 continue;
718 }
f366da65
WB
719 if ((!directories && !S_ISREG(mystat.st_mode)) ||
720 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
721 continue;
722
723 if (sz+2 >= asz) {
f366da65 724 void **tmp;
237e200e
SH
725 asz += BATCH_SIZE;
726 do {
f366da65 727 tmp = realloc(*list, asz * typesize);
237e200e
SH
728 } while (!tmp);
729 *list = tmp;
730 }
4ea38a4c 731 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
732 (*list)[sz+1] = NULL;
733 sz++;
734 }
735 if (closedir(dir) < 0) {
4ea38a4c 736 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno));
237e200e
SH
737 return false;
738 }
739 return true;
740}
741
f366da65
WB
/*
 * Iterator callback for cgfs_list_children(): return a heap copy of
 * @dir_entry, retrying the allocation until it succeeds.  @controller and
 * @cgroup are not used.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	size_t bytes = strlen(dir_entry) + 1;	/* include the \0 */
	char *copy = NULL;

	while (!copy)
		copy = malloc(bytes);
	memcpy(copy, dir_entry, bytes);
	return copy;
}
750
751bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
752{
753 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
754}
755
237e200e
SH
756void free_key(struct cgfs_files *k)
757{
758 if (!k)
759 return;
760 free(k->name);
761 free(k);
762}
763
/*
 * Free a NULL-terminated array of keys: every element up to the first
 * NULL, then the array itself.  A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;
	for (cur = keys; *cur; cur++)
		free_key(*cur);
	free(keys);
}
775
/*
 * Read @file inside @cgroup of @controller into a newly allocated string
 * stored in *value (trailing newlines removed).  Returns true when a value
 * was read; on false the caller must not rely on *value unless it was set
 * by the final read.  The caller frees *value.
 *
 * The error paths now return false; they used to return NULL from this
 * bool function.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc = find_mounted_controller(controller, &cfd);

	if (!tmpc)
		return false;

	/* . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	/* a leading "." keeps an absolute cgroup path relative for openat() */
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
798
799struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
800{
4ea38a4c 801 int ret, cfd;
237e200e 802 size_t len;
5dd3e6fd 803 char *fnam, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
804 struct stat sb;
805 struct cgfs_files *newkey;
237e200e
SH
806
807 if (!tmpc)
808 return false;
809
810 if (file && *file == '/')
811 file++;
812
813 if (file && index(file, '/'))
814 return NULL;
815
4ea38a4c
CB
816 /* . + /cgroup + / + file + \0 */
817 len = strlen(cgroup) + 3;
237e200e
SH
818 if (file)
819 len += strlen(file) + 1;
820 fnam = alloca(len);
4ea38a4c
CB
821 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
822 file ? "/" : "", file ? file : "");
237e200e 823
4ea38a4c 824 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
825 if (ret < 0)
826 return NULL;
827
828 do {
829 newkey = malloc(sizeof(struct cgfs_files));
830 } while (!newkey);
831 if (file)
832 newkey->name = must_copy_string(file);
833 else if (rindex(cgroup, '/'))
834 newkey->name = must_copy_string(rindex(cgroup, '/'));
835 else
836 newkey->name = must_copy_string(cgroup);
837 newkey->uid = sb.st_uid;
838 newkey->gid = sb.st_gid;
839 newkey->mode = sb.st_mode;
840
841 return newkey;
842}
843
/*
 * Iterator callback for cgfs_list_keys(): return the key for @dir_entry
 * in @cgroup of @controller.  A failed lookup is logged to stderr and
 * NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	fprintf(stderr, "%s: Error getting files under %s:%s\n",
		__func__, controller, cgroup);
	return NULL;
}
853
854bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
855{
856 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
857}
858
/*
 * Return true if @f exists as a directory inside @cgroup of @controller.
 * Returns false for an unknown controller, a path that cannot be built,
 * a failed fstatat(), or a non-directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, rc;
	size_t pathlen;
	struct stat st;
	char *path;
	char *mount = find_mounted_controller(controller, &cfd);

	if (!mount)
		return false;

	/* . + /cgroup + / + f + \0 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	rc = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (rc < 0 || (size_t)rc >= pathlen)
		return false;

	rc = fstatat(cfd, path, &st, 0);
	return rc >= 0 && S_ISDIR(st.st_mode);
}
881
882#define SEND_CREDS_OK 0
883#define SEND_CREDS_NOTSK 1
884#define SEND_CREDS_FAIL 2
885static bool recv_creds(int sock, struct ucred *cred, char *v);
886static int wait_for_pid(pid_t pid);
887static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 888static int send_creds_clone_wrapper(void *arg);
237e200e
SH
889
890/*
b10bdd6c 891 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
892 * over a unix sock so we can read the task's reaper's pid in our
893 * namespace
b10bdd6c
FG
894 *
895 * Note: glibc's fork() does not respect pidns, which can lead to failed
896 * assertions inside glibc (and thus failed forks) if the child's pid in
897 * the pidns and the parent pid outside are identical. Using clone prevents
898 * this issue.
237e200e
SH
899 */
900static void write_task_init_pid_exit(int sock, pid_t target)
901{
237e200e
SH
902 char fnam[100];
903 pid_t pid;
237e200e 904 int fd, ret;
b10bdd6c
FG
905 size_t stack_size = sysconf(_SC_PAGESIZE);
906 void *stack = alloca(stack_size);
237e200e
SH
907
908 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
909 if (ret < 0 || ret >= sizeof(fnam))
910 _exit(1);
911
912 fd = open(fnam, O_RDONLY);
913 if (fd < 0) {
914 perror("write_task_init_pid_exit open of ns/pid");
915 _exit(1);
916 }
917 if (setns(fd, 0)) {
918 perror("write_task_init_pid_exit setns 1");
919 close(fd);
920 _exit(1);
921 }
b10bdd6c 922 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
923 if (pid < 0)
924 _exit(1);
925 if (pid != 0) {
926 if (!wait_for_pid(pid))
927 _exit(1);
928 _exit(0);
929 }
b10bdd6c
FG
930}
931
932static int send_creds_clone_wrapper(void *arg) {
933 struct ucred cred;
934 char v;
935 int sock = *(int *)arg;
237e200e
SH
936
937 /* we are the child */
938 cred.uid = 0;
939 cred.gid = 0;
940 cred.pid = 1;
941 v = '1';
942 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
943 return 1;
944 return 0;
237e200e
SH
945}
946
947static pid_t get_init_pid_for_task(pid_t task)
948{
949 int sock[2];
950 pid_t pid;
951 pid_t ret = -1;
952 char v = '0';
953 struct ucred cred;
954
955 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
956 perror("socketpair");
957 return -1;
958 }
959
960 pid = fork();
961 if (pid < 0)
962 goto out;
963 if (!pid) {
964 close(sock[1]);
965 write_task_init_pid_exit(sock[0], task);
966 _exit(0);
967 }
968
969 if (!recv_creds(sock[1], &cred, &v))
970 goto out;
971 ret = cred.pid;
972
973out:
974 close(sock[0]);
975 close(sock[1]);
976 if (pid > 0)
977 wait_for_pid(pid);
978 return ret;
979}
980
981static pid_t lookup_initpid_in_store(pid_t qpid)
982{
983 pid_t answer = 0;
984 struct stat sb;
985 struct pidns_init_store *e;
986 char fnam[100];
987
988 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
989 store_lock();
990 if (stat(fnam, &sb) < 0)
991 goto out;
992 e = lookup_verify_initpid(&sb);
993 if (e) {
994 answer = e->initpid;
995 goto out;
996 }
997 answer = get_init_pid_for_task(qpid);
998 if (answer > 0)
999 save_initpid(&sb, answer);
1000
1001out:
1002 /* we prune at end in case we are returning
1003 * the value we were about to return */
1004 prune_initpid_store();
1005 store_unlock();
1006 return answer;
1007}
1008
/*
 * Wait for child @pid to terminate, retrying when waitpid() is
 * interrupted.  Returns 0 if the child exited normally with status 0,
 * -1 otherwise (including @pid <= 0 and waitpid() errors).
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1029
1030
1031/*
1032 * append pid to *src.
1033 * src: a pointer to a char* in which ot append the pid.
1034 * sz: the number of characters printed so far, minus trailing \0.
1035 * asz: the allocated size so far
1036 * pid: the pid to append
1037 */
1038static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1039{
1040 char tmp[30];
1041
1042 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1043
1044 if (!*src || tmplen + *sz + 1 >= *asz) {
1045 char *tmp;
1046 do {
1047 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1048 } while (!tmp);
1049 *src = tmp;
1050 *asz += BUF_RESERVE_SIZE;
1051 }
bbfd0e33 1052 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1053 *sz += tmplen;
237e200e
SH
1054}
1055
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 * The file is rewound first; lines that do not parse as three unsigned
 * numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned int) when no range contains
 * the id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,	// base id for a range in the idfile's namespace
		     host_base,	// base id for a range in the caller's namespace
		     range;	// number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range, and neither range
		 * end wraps, so ns_base + (in_id - host_base) cannot wrap
		 * either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1099
1100/*
1101 * for is_privileged_over,
1102 * specify whether we require the calling uid to be root in his
1103 * namespace
1104 */
1105#define NS_ROOT_REQD true
1106#define NS_ROOT_OPT false
1107
1108#define PROCLEN 100
1109
1110static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1111{
1112 char fpath[PROCLEN];
1113 int ret;
1114 bool answer = false;
1115 uid_t nsuid;
1116
1117 if (victim == -1 || uid == -1)
1118 return false;
1119
1120 /*
1121 * If the request is one not requiring root in the namespace,
1122 * then having the same uid suffices. (i.e. uid 1000 has write
1123 * access to files owned by uid 1000
1124 */
1125 if (!req_ns_root && uid == victim)
1126 return true;
1127
1128 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1129 if (ret < 0 || ret >= PROCLEN)
1130 return false;
1131 FILE *f = fopen(fpath, "r");
1132 if (!f)
1133 return false;
1134
1135 /* if caller's not root in his namespace, reject */
1136 nsuid = convert_id_to_ns(f, uid);
1137 if (nsuid)
1138 goto out;
1139
1140 /*
1141 * If victim is not mapped into caller's ns, reject.
1142 * XXX I'm not sure this check is needed given that fuse
1143 * will be sending requests where the vfs has converted
1144 */
1145 nsuid = convert_id_to_ns(f, victim);
1146 if (nsuid == -1)
1147 goto out;
1148
1149 answer = true;
1150
1151out:
1152 fclose(f);
1153 return answer;
1154}
1155
/*
 * Return true if the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode (read, write or both).
 * An unrecognized access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1175
1176
/*
 * taskcg is /a/b/c/d/e
 * querycg is /a/b/c
 * we return 'd'
 *
 * i.e. the first path component of @taskcg that follows @querycg, as a
 * newly allocated string the caller frees.  @taskcg must be longer than
 * @querycg; otherwise an error is logged and NULL is returned.  NULL is
 * also returned when the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	const char *rest, *slash;
	size_t complen;
	char *component;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* skip the query prefix and the '/' that follows it */
	if (strcmp(querycg, "/") == 0)
		rest = taskcg + 1;
	else
		rest = taskcg + strlen(querycg) + 1;

	slash = strchr(rest, '/');
	complen = slash ? (size_t)(slash - rest) : strlen(rest);

	component = malloc(complen + 1);
	if (!component)
		return NULL;
	memcpy(component, rest, complen);
	component[complen] = '\0';
	return component;
}
1202
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
1209
1210static char *get_pid_cgroup(pid_t pid, const char *contrl)
1211{
5dd3e6fd 1212 int cfd;
237e200e
SH
1213 char fnam[PROCLEN];
1214 FILE *f;
1215 char *answer = NULL;
1216 char *line = NULL;
1217 size_t len = 0;
1218 int ret;
5dd3e6fd 1219 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1220 if (!h)
1221 return NULL;
1222
1223 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1224 if (ret < 0 || ret >= PROCLEN)
1225 return NULL;
1226 if (!(f = fopen(fnam, "r")))
1227 return NULL;
1228
1229 while (getline(&line, &len, f) != -1) {
1230 char *c1, *c2;
1231 if (!line[0])
1232 continue;
1233 c1 = strchr(line, ':');
1234 if (!c1)
1235 goto out;
1236 c1++;
1237 c2 = strchr(c1, ':');
1238 if (!c2)
1239 goto out;
1240 *c2 = '\0';
1241 if (strcmp(c1, h) != 0)
1242 continue;
1243 c2++;
1244 stripnewline(c2);
1245 do {
1246 answer = strdup(c2);
1247 } while (!answer);
1248 break;
1249 }
1250
1251out:
1252 fclose(f);
1253 free(line);
1254 return answer;
1255}
1256
1257/*
1258 * check whether a fuse context may access a cgroup dir or file
1259 *
1260 * If file is not null, it is a cgroup file to check under cg.
1261 * If file is null, then we are checking perms on cg itself.
1262 *
1263 * For files we can check the mode of the list_keys result.
1264 * For cgroups, we must make assumptions based on the files under the
1265 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1266 * yet.
1267 */
1268static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1269{
1270 struct cgfs_files *k = NULL;
1271 bool ret = false;
1272
1273 k = cgfs_get_key(contrl, cg, file);
1274 if (!k)
1275 return false;
1276
1277 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1278 if (perms_include(k->mode >> 6, mode)) {
1279 ret = true;
1280 goto out;
1281 }
1282 }
1283 if (fc->gid == k->gid) {
1284 if (perms_include(k->mode >> 3, mode)) {
1285 ret = true;
1286 goto out;
1287 }
1288 }
1289 ret = perms_include(k->mode, mode);
1290
1291out:
1292 free_key(k);
1293 return ret;
1294}
1295
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from @cg in place.  If @cg
 * consists of nothing but "/init.scope" it is reduced to "/".
 */
static void prune_init_slice(char *cg)
{
	const size_t suffix_len = strlen(INITSCOPE);
	size_t len = strlen(cg);
	char *tail;

	if (len < suffix_len)
		return;

	tail = cg + (len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* Keep the leading '/' when the suffix is the whole string. */
	if (tail == cg)
		tail++;
	*tail = '\0';
}
1313
/*
 * Check whether the cgroup of @pid (for controller @contrl) is a string
 * prefix of @cg, i.e. whether @cg lies at or below the caller's cgroup.
 *
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * When the answer is false and @nextcg is not NULL, *nextcg is set to the
 * result of get_next_cgroup_dir() for the caller's cgroup and @cg; the
 * caller must free it.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	char *cmp;
	bool in_ancestor;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise they pass in a cgroup without leading '/'.  In the
	 * latter case the first character of the task's cgroup is skipped
	 * before comparing.
	 * NOTE(review): presumably that character is always the leading '/'
	 * of the path read from /proc - confirm.
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = taskcg;
	else
		cmp = taskcg + 1;

	in_ancestor = (strncmp(cmp, cg, strlen(cmp)) == 0);
	if (!in_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return in_ancestor;
}
1356
/*
 * Check whether @pid may see that the cgroup directory @cg exists.
 *
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible.  Otherwise @cg is visible if
 * the task sits in the root cgroup, if @cg equals the task's cgroup, or
 * if one of the two paths is a whole-component prefix of the other.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *taskbuf;
	const char *task_cg;
	size_t cg_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	taskbuf = get_pid_cgroup(pid, contrl);
	if (!taskbuf)
		return false;
	prune_init_slice(taskbuf);

	/* Compare without the first character of the task's cgroup. */
	task_cg = taskbuf + 1;
	cg_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Nothing left after the first character: task is in the root. */
		visible = true;
	} else if (cg_len == task_len) {
		visible = (strcmp(cg, task_cg) == 0);
	} else {
		/*
		 * Parent lookup (cg shorter) or child lookup (cg longer): the
		 * shorter path must match the start of the longer one and be
		 * followed by a '/' there.
		 */
		const char *longer = (cg_len < task_len) ? task_cg : cg;
		size_t shorter_len = (cg_len < task_len) ? cg_len : task_len;

		visible = strncmp(task_cg, cg, shorter_len) == 0 &&
			  longer[shorter_len] == '/';
	}

	free(taskbuf);
	return visible;
}
1407
1408/*
1409 * given /cgroup/freezer/a/b, return "freezer".
1410 * the returned char* should NOT be freed.
1411 */
1412static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1413{
1414 const char *p1;
1415 char *contr, *slash;
1416
1417 if (strlen(path) < 9)
1418 return NULL;
1419 if (*(path+7) != '/')
1420 return NULL;
1421 p1 = path+8;
1422 contr = strdupa(p1);
1423 if (!contr)
1424 return NULL;
1425 slash = strstr(contr, "/");
1426 if (slash)
1427 *slash = '\0';
1428
1429 int i;
1430 for (i = 0; i < num_hierarchies; i++) {
1431 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1432 return hierarchies[i];
1433 }
1434 return NULL;
1435}
1436
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL if @path is shorter than nine characters or has no
 * such '/'.  Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
1452
/*
 * Split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg cut off at its last '/';
 * the caller must free it.  @last is set to point at that last '/' inside
 * @cg itself (so it still starts with the '/') and must not be freed.
 * If @cg contains no '/', @dir is a full copy of @cg and @last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;
	char *sep;

	/* Retry until the copy succeeds. */
	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	*last = strrchr(cg, '/');
	if (!*last)
		return;

	sep = strrchr(copy, '/');
	*sep = '\0';
}
1472
1473/*
1474 * FUSE ops for /cgroup
1475 */
1476
1477int cg_getattr(const char *path, struct stat *sb)
1478{
1479 struct timespec now;
1480 struct fuse_context *fc = fuse_get_context();
1481 char * cgdir = NULL;
1482 char *last = NULL, *path1, *path2;
1483 struct cgfs_files *k = NULL;
1484 const char *cgroup;
1485 const char *controller = NULL;
1486 int ret = -ENOENT;
1487
1488
1489 if (!fc)
1490 return -EIO;
1491
1492 memset(sb, 0, sizeof(struct stat));
1493
1494 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1495 return -EINVAL;
1496
1497 sb->st_uid = sb->st_gid = 0;
1498 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1499 sb->st_size = 0;
1500
1501 if (strcmp(path, "/cgroup") == 0) {
1502 sb->st_mode = S_IFDIR | 00755;
1503 sb->st_nlink = 2;
1504 return 0;
1505 }
1506
1507 controller = pick_controller_from_path(fc, path);
1508 if (!controller)
1509 return -EIO;
1510 cgroup = find_cgroup_in_path(path);
1511 if (!cgroup) {
1512 /* this is just /cgroup/controller, return it as a dir */
1513 sb->st_mode = S_IFDIR | 00755;
1514 sb->st_nlink = 2;
1515 return 0;
1516 }
1517
1518 get_cgdir_and_path(cgroup, &cgdir, &last);
1519
1520 if (!last) {
1521 path1 = "/";
1522 path2 = cgdir;
1523 } else {
1524 path1 = cgdir;
1525 path2 = last;
1526 }
1527
1528 pid_t initpid = lookup_initpid_in_store(fc->pid);
1529 if (initpid <= 0)
1530 initpid = fc->pid;
1531 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1532 * Then check that caller's cgroup is under path if last is a child
1533 * cgroup, or cgdir if last is a file */
1534
1535 if (is_child_cgroup(controller, path1, path2)) {
1536 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1537 ret = -ENOENT;
1538 goto out;
1539 }
1540 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1541 /* this is just /cgroup/controller, return it as a dir */
1542 sb->st_mode = S_IFDIR | 00555;
1543 sb->st_nlink = 2;
1544 ret = 0;
1545 goto out;
1546 }
1547 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1548 ret = -EACCES;
1549 goto out;
1550 }
1551
1552 // get uid, gid, from '/tasks' file and make up a mode
1553 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1554 sb->st_mode = S_IFDIR | 00755;
1555 k = cgfs_get_key(controller, cgroup, NULL);
1556 if (!k) {
1557 sb->st_uid = sb->st_gid = 0;
1558 } else {
1559 sb->st_uid = k->uid;
1560 sb->st_gid = k->gid;
1561 }
1562 free_key(k);
1563 sb->st_nlink = 2;
1564 ret = 0;
1565 goto out;
1566 }
1567
1568 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1569 sb->st_mode = S_IFREG | k->mode;
1570 sb->st_nlink = 1;
1571 sb->st_uid = k->uid;
1572 sb->st_gid = k->gid;
1573 sb->st_size = 0;
1574 free_key(k);
1575 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1576 ret = -ENOENT;
1577 goto out;
1578 }
1579 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1580 ret = -EACCES;
1581 goto out;
1582 }
1583
1584 ret = 0;
1585 }
1586
1587out:
1588 free(cgdir);
1589 return ret;
1590}
1591
1592int cg_opendir(const char *path, struct fuse_file_info *fi)
1593{
1594 struct fuse_context *fc = fuse_get_context();
1595 const char *cgroup;
1596 struct file_info *dir_info;
1597 char *controller = NULL;
1598
1599 if (!fc)
1600 return -EIO;
1601
1602 if (strcmp(path, "/cgroup") == 0) {
1603 cgroup = NULL;
1604 controller = NULL;
1605 } else {
1606 // return list of keys for the controller, and list of child cgroups
1607 controller = pick_controller_from_path(fc, path);
1608 if (!controller)
1609 return -EIO;
1610
1611 cgroup = find_cgroup_in_path(path);
1612 if (!cgroup) {
1613 /* this is just /cgroup/controller, return its contents */
1614 cgroup = "/";
1615 }
1616 }
1617
1618 pid_t initpid = lookup_initpid_in_store(fc->pid);
1619 if (initpid <= 0)
1620 initpid = fc->pid;
1621 if (cgroup) {
1622 if (!caller_may_see_dir(initpid, controller, cgroup))
1623 return -ENOENT;
1624 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1625 return -EACCES;
1626 }
1627
1628 /* we'll free this at cg_releasedir */
1629 dir_info = malloc(sizeof(*dir_info));
1630 if (!dir_info)
1631 return -ENOMEM;
1632 dir_info->controller = must_copy_string(controller);
1633 dir_info->cgroup = must_copy_string(cgroup);
1634 dir_info->type = LXC_TYPE_CGDIR;
1635 dir_info->buf = NULL;
1636 dir_info->file = NULL;
1637 dir_info->buflen = 0;
1638
1639 fi->fh = (unsigned long)dir_info;
1640 return 0;
1641}
1642
1643int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1644 struct fuse_file_info *fi)
1645{
1646 struct file_info *d = (struct file_info *)fi->fh;
1647 struct cgfs_files **list = NULL;
1648 int i, ret;
1649 char *nextcg = NULL;
1650 struct fuse_context *fc = fuse_get_context();
1651 char **clist = NULL;
1652
1653 if (d->type != LXC_TYPE_CGDIR) {
1654 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1655 return -EIO;
1656 }
1657 if (!d->cgroup && !d->controller) {
1658 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1659 int i;
1660
1661 for (i = 0; i < num_hierarchies; i++) {
1662 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1663 return -EIO;
1664 }
1665 }
1666 return 0;
1667 }
1668
1669 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1670 // not a valid cgroup
1671 ret = -EINVAL;
1672 goto out;
1673 }
1674
1675 pid_t initpid = lookup_initpid_in_store(fc->pid);
1676 if (initpid <= 0)
1677 initpid = fc->pid;
1678 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1679 if (nextcg) {
1680 ret = filler(buf, nextcg, NULL, 0);
1681 free(nextcg);
1682 if (ret != 0) {
1683 ret = -EIO;
1684 goto out;
1685 }
1686 }
1687 ret = 0;
1688 goto out;
1689 }
1690
1691 for (i = 0; list[i]; i++) {
1692 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1693 ret = -EIO;
1694 goto out;
1695 }
1696 }
1697
1698 // now get the list of child cgroups
1699
1700 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1701 ret = 0;
1702 goto out;
1703 }
f366da65
WB
1704 if (clist) {
1705 for (i = 0; clist[i]; i++) {
1706 if (filler(buf, clist[i], NULL, 0) != 0) {
1707 ret = -EIO;
1708 goto out;
1709 }
237e200e
SH
1710 }
1711 }
1712 ret = 0;
1713
1714out:
1715 free_keys(list);
1716 if (clist) {
1717 for (i = 0; clist[i]; i++)
1718 free(clist[i]);
1719 free(clist);
1720 }
1721 return ret;
1722}
1723
43215927 1724static void do_release_file_info(struct fuse_file_info *fi)
237e200e 1725{
43215927
SH
1726 struct file_info *f = (struct file_info *)fi->fh;
1727
237e200e
SH
1728 if (!f)
1729 return;
43215927
SH
1730
1731 fi->fh = 0;
1732
237e200e 1733 free(f->controller);
43215927 1734 f->controller = NULL;
237e200e 1735 free(f->cgroup);
43215927 1736 f->cgroup = NULL;
237e200e 1737 free(f->file);
43215927 1738 f->file = NULL;
237e200e 1739 free(f->buf);
43215927 1740 f->buf = NULL;
237e200e
SH
1741 free(f);
1742}
1743
/*
 * releasedir: free the directory handle stored in fi->fh.  @path is not
 * used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1749
1750int cg_open(const char *path, struct fuse_file_info *fi)
1751{
1752 const char *cgroup;
1753 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1754 struct cgfs_files *k = NULL;
1755 struct file_info *file_info;
1756 struct fuse_context *fc = fuse_get_context();
1757 int ret;
1758
1759 if (!fc)
1760 return -EIO;
1761
1762 controller = pick_controller_from_path(fc, path);
1763 if (!controller)
1764 return -EIO;
1765 cgroup = find_cgroup_in_path(path);
1766 if (!cgroup)
1767 return -EINVAL;
1768
1769 get_cgdir_and_path(cgroup, &cgdir, &last);
1770 if (!last) {
1771 path1 = "/";
1772 path2 = cgdir;
1773 } else {
1774 path1 = cgdir;
1775 path2 = last;
1776 }
1777
1778 k = cgfs_get_key(controller, path1, path2);
1779 if (!k) {
1780 ret = -EINVAL;
1781 goto out;
1782 }
1783 free_key(k);
1784
1785 pid_t initpid = lookup_initpid_in_store(fc->pid);
1786 if (initpid <= 0)
1787 initpid = fc->pid;
1788 if (!caller_may_see_dir(initpid, controller, path1)) {
1789 ret = -ENOENT;
1790 goto out;
1791 }
1792 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
1793 ret = -EACCES;
1794 goto out;
1795 }
1796
1797 /* we'll free this at cg_release */
1798 file_info = malloc(sizeof(*file_info));
1799 if (!file_info) {
1800 ret = -ENOMEM;
1801 goto out;
1802 }
1803 file_info->controller = must_copy_string(controller);
1804 file_info->cgroup = must_copy_string(path1);
1805 file_info->file = must_copy_string(path2);
1806 file_info->type = LXC_TYPE_CGFILE;
1807 file_info->buf = NULL;
1808 file_info->buflen = 0;
1809
1810 fi->fh = (unsigned long)file_info;
1811 ret = 0;
1812
1813out:
1814 free(cgdir);
1815 return ret;
1816}
1817
bddbb106
SH
1818int cg_access(const char *path, int mode)
1819{
1820 const char *cgroup;
1821 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1822 struct cgfs_files *k = NULL;
1823 struct fuse_context *fc = fuse_get_context();
1824 int ret;
1825
1826 if (!fc)
1827 return -EIO;
1828
1829 controller = pick_controller_from_path(fc, path);
1830 if (!controller)
1831 return -EIO;
1832 cgroup = find_cgroup_in_path(path);
575316c4
SH
1833 if (!cgroup) {
1834 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
1835 if ((mode & W_OK) == 0)
1836 return 0;
1837 return -EACCES;
575316c4 1838 }
bddbb106
SH
1839
1840 get_cgdir_and_path(cgroup, &cgdir, &last);
1841 if (!last) {
1842 path1 = "/";
1843 path2 = cgdir;
1844 } else {
1845 path1 = cgdir;
1846 path2 = last;
1847 }
1848
1849 k = cgfs_get_key(controller, path1, path2);
1850 if (!k) {
3f441bc7
SH
1851 if ((mode & W_OK) == 0)
1852 ret = 0;
1853 else
1854 ret = -EACCES;
bddbb106
SH
1855 goto out;
1856 }
1857 free_key(k);
1858
1859 pid_t initpid = lookup_initpid_in_store(fc->pid);
1860 if (initpid <= 0)
1861 initpid = fc->pid;
1862 if (!caller_may_see_dir(initpid, controller, path1)) {
1863 ret = -ENOENT;
1864 goto out;
1865 }
1866 if (!fc_may_access(fc, controller, path1, path2, mode)) {
1867 ret = -EACCES;
1868 goto out;
1869 }
1870
1871 ret = 0;
1872
1873out:
1874 free(cgdir);
1875 return ret;
1876}
1877
237e200e
SH
/*
 * release: free the file handle stored in fi->fh.  @path is not used.
 * Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1883
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events or @timeout
 * seconds have passed since the call started.  Interrupted waits (EINTR)
 * are restarted with the remaining time.
 *
 * Returns true if an event was reported.  Returns false otherwise; errno
 * is set to 0 before the epoll fd is closed when the deadline had already
 * passed, and to the value left by epoll_wait() when that call returned
 * zero or failed.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, nready, now, start, remaining, saved_errno;

	start = time(NULL);
	if (start < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	for (;;) {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		remaining = (start + timeout) - now;
		if (remaining < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}

		nready = epoll_wait(epfd, &ev, 1, 1000*remaining + 1);
		if (nready < 0 && errno == EINTR)
			continue;
		break;
	}

	/* close() may change errno; keep the value from epoll_wait(). */
	saved_errno = errno;
	close(epfd);

	if (nready <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1931
/*
 * Receive up to @len bytes from @sockfd into @buf, after waiting up to
 * two seconds for the socket to become readable.  Returns the result of
 * recv(), or -1 if wait_for_sock() reported no event.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (wait_for_sock(sockfd, 2))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
1938
1939static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1940{
1941 struct msghdr msg = { 0 };
1942 struct iovec iov;
1943 struct cmsghdr *cmsg;
1944 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1945 char buf[1];
1946 buf[0] = 'p';
1947
1948 if (pingfirst) {
1949 if (msgrecv(sock, buf, 1) != 1) {
1950 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1951 __func__);
1952 return SEND_CREDS_FAIL;
1953 }
1954 }
1955
1956 msg.msg_control = cmsgbuf;
1957 msg.msg_controllen = sizeof(cmsgbuf);
1958
1959 cmsg = CMSG_FIRSTHDR(&msg);
1960 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1961 cmsg->cmsg_level = SOL_SOCKET;
1962 cmsg->cmsg_type = SCM_CREDENTIALS;
1963 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1964
1965 msg.msg_name = NULL;
1966 msg.msg_namelen = 0;
1967
1968 buf[0] = v;
1969 iov.iov_base = buf;
1970 iov.iov_len = sizeof(buf);
1971 msg.msg_iov = &iov;
1972 msg.msg_iovlen = 1;
1973
1974 if (sendmsg(sock, &msg, 0) < 0) {
1975 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1976 strerror(errno));
1977 if (errno == 3)
1978 return SEND_CREDS_NOTSK;
1979 return SEND_CREDS_FAIL;
1980 }
1981
1982 return SEND_CREDS_OK;
1983}
1984
1985static bool recv_creds(int sock, struct ucred *cred, char *v)
1986{
1987 struct msghdr msg = { 0 };
1988 struct iovec iov;
1989 struct cmsghdr *cmsg;
1990 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1991 char buf[1];
1992 int ret;
1993 int optval = 1;
1994
1995 *v = '1';
1996
1997 cred->pid = -1;
1998 cred->uid = -1;
1999 cred->gid = -1;
2000
2001 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2002 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
2003 return false;
2004 }
2005 buf[0] = '1';
2006 if (write(sock, buf, 1) != 1) {
2007 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
2008 return false;
2009 }
2010
2011 msg.msg_name = NULL;
2012 msg.msg_namelen = 0;
2013 msg.msg_control = cmsgbuf;
2014 msg.msg_controllen = sizeof(cmsgbuf);
2015
2016 iov.iov_base = buf;
2017 iov.iov_len = sizeof(buf);
2018 msg.msg_iov = &iov;
2019 msg.msg_iovlen = 1;
2020
2021 if (!wait_for_sock(sock, 2)) {
2022 fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
2023 strerror(errno));
2024 return false;
2025 }
2026 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2027 if (ret < 0) {
2028 fprintf(stderr, "Failed to receive scm_cred: %s\n",
2029 strerror(errno));
2030 return false;
2031 }
2032
2033 cmsg = CMSG_FIRSTHDR(&msg);
2034
2035 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2036 cmsg->cmsg_level == SOL_SOCKET &&
2037 cmsg->cmsg_type == SCM_CREDENTIALS) {
2038 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2039 }
2040 *v = buf[0];
2041
2042 return true;
2043}
2044
35174b0f
FG
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* pipe pair; the child closes [0] and writes its ack to [1] */
	int *cpipe;
	/* socket passed on to the wrapped function */
	int sock;
	/* pid passed on to the wrapped function */
	pid_t tpid;
	/* function to run in the child: pid_from_ns or pid_to_ns */
	int (*wrapped) (int, pid_t);
};
2051
2052/*
2053 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2054 * with clone(). This simply writes '1' as ACK back to the parent
2055 * before calling the actual wrapped function.
2056 */
2057static int pid_ns_clone_wrapper(void *arg) {
2058 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2059 char b = '1';
2060
2061 close(args->cpipe[0]);
2062 if (write(args->cpipe[1], &b, sizeof(char)) < 0) {
2063 fprintf(stderr, "%s (child): error on write: %s\n",
2064 __func__, strerror(errno));
2065 }
2066 close(args->cpipe[1]);
2067 return args->wrapped(args->sock, args->tpid);
2068}
237e200e
SH
2069
2070/*
2071 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2072 * int value back over the socket. This shifts the pid from the
2073 * sender's pidns into tpid's pidns.
2074 */
35174b0f 2075static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2076{
2077 char v = '0';
2078 struct ucred cred;
2079
2080 while (recv_creds(sock, &cred, &v)) {
2081 if (v == '1')
35174b0f 2082 return 0;
237e200e 2083 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2084 return 1;
237e200e 2085 }
35174b0f 2086 return 0;
237e200e
SH
2087}
2088
35174b0f 2089
237e200e
SH
2090/*
2091 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2092 * in your old pidns. Only children which you clone will be in the target
2093 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2094 * actually convert pids.
2095 *
2096 * Note: glibc's fork() does not respect pidns, which can lead to failed
2097 * assertions inside glibc (and thus failed forks) if the child's pid in
2098 * the pidns and the parent pid outside are identical. Using clone prevents
2099 * this issue.
237e200e
SH
2100 */
2101static void pid_to_ns_wrapper(int sock, pid_t tpid)
2102{
2103 int newnsfd = -1, ret, cpipe[2];
2104 char fnam[100];
2105 pid_t cpid;
2106 char v;
2107
2108 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2109 if (ret < 0 || ret >= sizeof(fnam))
2110 _exit(1);
2111 newnsfd = open(fnam, O_RDONLY);
2112 if (newnsfd < 0)
2113 _exit(1);
2114 if (setns(newnsfd, 0) < 0)
2115 _exit(1);
2116 close(newnsfd);
2117
2118 if (pipe(cpipe) < 0)
2119 _exit(1);
2120
35174b0f
FG
2121 struct pid_ns_clone_args args = {
2122 .cpipe = cpipe,
2123 .sock = sock,
2124 .tpid = tpid,
2125 .wrapped = &pid_to_ns
2126 };
2127 size_t stack_size = sysconf(_SC_PAGESIZE);
2128 void *stack = alloca(stack_size);
2129
2130 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2131 if (cpid < 0)
2132 _exit(1);
2133
237e200e
SH
2134 // give the child 1 second to be done forking and
2135 // write its ack
2136 if (!wait_for_sock(cpipe[0], 1))
2137 _exit(1);
2138 ret = read(cpipe[0], &v, 1);
2139 if (ret != sizeof(char) || v != '1')
2140 _exit(1);
2141
2142 if (!wait_for_pid(cpid))
2143 _exit(1);
2144 _exit(0);
2145}
2146
2147/*
2148 * To read cgroup files with a particular pid, we will setns into the child
2149 * pidns, open a pipe, fork a child - which will be the first to really be in
2150 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2151 */
2152bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2153{
2154 int sock[2] = {-1, -1};
2155 char *tmpdata = NULL;
2156 int ret;
2157 pid_t qpid, cpid = -1;
2158 bool answer = false;
2159 char v = '0';
2160 struct ucred cred;
2161 size_t sz = 0, asz = 0;
2162
2163 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2164 return false;
2165
2166 /*
2167 * Now we read the pids from returned data one by one, pass
2168 * them into a child in the target namespace, read back the
2169 * translated pids, and put them into our to-return data
2170 */
2171
2172 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2173 perror("socketpair");
2174 free(tmpdata);
2175 return false;
2176 }
2177
2178 cpid = fork();
2179 if (cpid == -1)
2180 goto out;
2181
2182 if (!cpid) // child - exits when done
2183 pid_to_ns_wrapper(sock[1], tpid);
2184
2185 char *ptr = tmpdata;
2186 cred.uid = 0;
2187 cred.gid = 0;
2188 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2189 cred.pid = qpid;
2190 ret = send_creds(sock[0], &cred, v, true);
2191
2192 if (ret == SEND_CREDS_NOTSK)
2193 goto next;
2194 if (ret == SEND_CREDS_FAIL)
2195 goto out;
2196
2197 // read converted results
2198 if (!wait_for_sock(sock[0], 2)) {
2199 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2200 __func__, strerror(errno));
2201 goto out;
2202 }
2203 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2204 fprintf(stderr, "%s: error reading pid from child: %s\n",
2205 __func__, strerror(errno));
2206 goto out;
2207 }
2208 must_strcat_pid(d, &sz, &asz, qpid);
2209next:
2210 ptr = strchr(ptr, '\n');
2211 if (!ptr)
2212 break;
2213 ptr++;
2214 }
2215
2216 cred.pid = getpid();
2217 v = '1';
2218 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2219 // failed to ask child to exit
2220 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2221 __func__, strerror(errno));
2222 goto out;
2223 }
2224
2225 answer = true;
2226
2227out:
2228 free(tmpdata);
2229 if (cpid != -1)
2230 wait_for_pid(cpid);
2231 if (sock[0] != -1) {
2232 close(sock[0]);
2233 close(sock[1]);
2234 }
2235 return answer;
2236}
2237
2238int cg_read(const char *path, char *buf, size_t size, off_t offset,
2239 struct fuse_file_info *fi)
2240{
2241 struct fuse_context *fc = fuse_get_context();
2242 struct file_info *f = (struct file_info *)fi->fh;
2243 struct cgfs_files *k = NULL;
2244 char *data = NULL;
2245 int ret, s;
2246 bool r;
2247
2248 if (f->type != LXC_TYPE_CGFILE) {
2249 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2250 return -EIO;
2251 }
2252
2253 if (offset)
2254 return 0;
2255
2256 if (!fc)
2257 return -EIO;
2258
2259 if (!f->controller)
2260 return -EINVAL;
2261
2262 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2263 return -EINVAL;
2264 }
2265 free_key(k);
2266
2267
888f8f3c 2268 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2269 ret = -EACCES;
2270 goto out;
2271 }
2272
2273 if (strcmp(f->file, "tasks") == 0 ||
2274 strcmp(f->file, "/tasks") == 0 ||
2275 strcmp(f->file, "/cgroup.procs") == 0 ||
2276 strcmp(f->file, "cgroup.procs") == 0)
2277 // special case - we have to translate the pids
2278 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2279 else
2280 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2281
2282 if (!r) {
2283 ret = -EINVAL;
2284 goto out;
2285 }
2286
2287 if (!data) {
2288 ret = 0;
2289 goto out;
2290 }
2291 s = strlen(data);
2292 if (s > size)
2293 s = size;
2294 memcpy(buf, data, s);
2295 if (s > 0 && s < size && data[s-1] != '\n')
2296 buf[s++] = '\n';
2297
2298 ret = s;
2299
2300out:
2301 free(data);
2302 return ret;
2303}
2304
35174b0f 2305static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2306{
2307 pid_t vpid;
2308 struct ucred cred;
2309 char v;
2310 int ret;
2311
2312 cred.uid = 0;
2313 cred.gid = 0;
2314 while (1) {
2315 if (!wait_for_sock(sock, 2)) {
2316 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
35174b0f 2317 return 1;
237e200e
SH
2318 }
2319 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2320 fprintf(stderr, "%s: bad read from parent: %s\n",
2321 __func__, strerror(errno));
35174b0f 2322 return 1;
237e200e
SH
2323 }
2324 if (vpid == -1) // done
2325 break;
2326 v = '0';
2327 cred.pid = vpid;
2328 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2329 v = '1';
2330 cred.pid = getpid();
2331 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2332 return 1;
237e200e
SH
2333 }
2334 }
35174b0f 2335 return 0;
237e200e
SH
2336}
2337
2338static void pid_from_ns_wrapper(int sock, pid_t tpid)
2339{
2340 int newnsfd = -1, ret, cpipe[2];
2341 char fnam[100];
2342 pid_t cpid;
2343 char v;
2344
2345 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2346 if (ret < 0 || ret >= sizeof(fnam))
2347 _exit(1);
2348 newnsfd = open(fnam, O_RDONLY);
2349 if (newnsfd < 0)
2350 _exit(1);
2351 if (setns(newnsfd, 0) < 0)
2352 _exit(1);
2353 close(newnsfd);
2354
2355 if (pipe(cpipe) < 0)
2356 _exit(1);
2357
35174b0f
FG
2358 struct pid_ns_clone_args args = {
2359 .cpipe = cpipe,
2360 .sock = sock,
2361 .tpid = tpid,
2362 .wrapped = &pid_from_ns
2363 };
f0f8b851
SH
2364 size_t stack_size = sysconf(_SC_PAGESIZE);
2365 void *stack = alloca(stack_size);
35174b0f
FG
2366
2367 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2368 if (cpid < 0)
2369 _exit(1);
2370
237e200e
SH
2371 // give the child 1 second to be done forking and
2372 // write its ack
2373 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2374 _exit(1);
237e200e 2375 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2376 if (ret != sizeof(char) || v != '1')
2377 _exit(1);
237e200e
SH
2378
2379 if (!wait_for_pid(cpid))
2380 _exit(1);
2381 _exit(0);
237e200e
SH
2382}
2383
/*
 * Given host @uid, store in *@answer the uid to which it maps in
 * @pid's user namespace, as reported by convert_id_to_ns() on
 * /proc/@pid/uid_map.
 *
 * Returns true on success.  Returns false if the map cannot be opened
 * (*@answer is left untouched) or if the conversion yields -1 (which is
 * then also what *@answer holds).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);
	map = fopen(map_path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2405
/*
 * get_pid_creds: get the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and keep that value if the file cannot
 * be opened or their line is not found; parsing stops at the first
 * malformed line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t u;
	gid_t g;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (status == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	/* The path buffer is reused for the lines read from the file. */
	while (fgets(line, sizeof(line), status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = g;
		}
	}

	fclose(status);
}
2444
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* Requestor must be root in his namespace and the victim mapped. */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2470
2471static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2472 const char *file, const char *buf)
2473{
2474 int sock[2] = {-1, -1};
2475 pid_t qpid, cpid = -1;
2476 FILE *pids_file = NULL;
2477 bool answer = false, fail = false;
2478
2479 pids_file = open_pids_file(contrl, cg);
2480 if (!pids_file)
2481 return false;
2482
2483 /*
2484 * write the pids to a socket, have helper in writer's pidns
2485 * call movepid for us
2486 */
2487 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2488 perror("socketpair");
2489 goto out;
2490 }
2491
2492 cpid = fork();
2493 if (cpid == -1)
2494 goto out;
2495
2496 if (!cpid) { // child
2497 fclose(pids_file);
2498 pid_from_ns_wrapper(sock[1], tpid);
2499 }
2500
2501 const char *ptr = buf;
2502 while (sscanf(ptr, "%d", &qpid) == 1) {
2503 struct ucred cred;
2504 char v;
2505
2506 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2507 fprintf(stderr, "%s: error writing pid to child: %s\n",
2508 __func__, strerror(errno));
2509 goto out;
2510 }
2511
2512 if (recv_creds(sock[0], &cred, &v)) {
2513 if (v == '0') {
2514 if (!may_move_pid(tpid, tuid, cred.pid)) {
2515 fail = true;
2516 break;
2517 }
2518 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2519 fail = true;
2520 }
2521 }
2522
2523 ptr = strchr(ptr, '\n');
2524 if (!ptr)
2525 break;
2526 ptr++;
2527 }
2528
2529 /* All good, write the value */
2530 qpid = -1;
2531 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2532 fprintf(stderr, "Warning: failed to ask child to exit\n");
2533
2534 if (!fail)
2535 answer = true;
2536
2537out:
2538 if (cpid != -1)
2539 wait_for_pid(cpid);
2540 if (sock[0] != -1) {
2541 close(sock[0]);
2542 close(sock[1]);
2543 }
2544 if (pids_file) {
2545 if (fclose(pids_file) != 0)
2546 answer = false;
2547 }
2548 return answer;
2549}
2550
2551int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2552 struct fuse_file_info *fi)
2553{
2554 struct fuse_context *fc = fuse_get_context();
2555 char *localbuf = NULL;
2556 struct cgfs_files *k = NULL;
2557 struct file_info *f = (struct file_info *)fi->fh;
2558 bool r;
2559
2560 if (f->type != LXC_TYPE_CGFILE) {
2561 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2562 return -EIO;
2563 }
2564
2565 if (offset)
2566 return 0;
2567
2568 if (!fc)
2569 return -EIO;
2570
2571 localbuf = alloca(size+1);
2572 localbuf[size] = '\0';
2573 memcpy(localbuf, buf, size);
2574
2575 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2576 size = -EINVAL;
2577 goto out;
2578 }
2579
2580 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2581 size = -EACCES;
2582 goto out;
2583 }
2584
2585 if (strcmp(f->file, "tasks") == 0 ||
2586 strcmp(f->file, "/tasks") == 0 ||
2587 strcmp(f->file, "/cgroup.procs") == 0 ||
2588 strcmp(f->file, "cgroup.procs") == 0)
2589 // special case - we have to translate the pids
2590 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2591 else
2592 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2593
2594 if (!r)
2595 size = -EINVAL;
2596
2597out:
2598 free_key(k);
2599 return size;
2600}
2601
2602int cg_chown(const char *path, uid_t uid, gid_t gid)
2603{
2604 struct fuse_context *fc = fuse_get_context();
2605 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2606 struct cgfs_files *k = NULL;
2607 const char *cgroup;
2608 int ret;
2609
2610 if (!fc)
2611 return -EIO;
2612
2613 if (strcmp(path, "/cgroup") == 0)
2614 return -EINVAL;
2615
2616 controller = pick_controller_from_path(fc, path);
2617 if (!controller)
2618 return -EINVAL;
2619 cgroup = find_cgroup_in_path(path);
2620 if (!cgroup)
2621 /* this is just /cgroup/controller */
2622 return -EINVAL;
2623
2624 get_cgdir_and_path(cgroup, &cgdir, &last);
2625
2626 if (!last) {
2627 path1 = "/";
2628 path2 = cgdir;
2629 } else {
2630 path1 = cgdir;
2631 path2 = last;
2632 }
2633
2634 if (is_child_cgroup(controller, path1, path2)) {
2635 // get uid, gid, from '/tasks' file and make up a mode
2636 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2637 k = cgfs_get_key(controller, cgroup, "tasks");
2638
2639 } else
2640 k = cgfs_get_key(controller, path1, path2);
2641
2642 if (!k) {
2643 ret = -EINVAL;
2644 goto out;
2645 }
2646
2647 /*
2648 * This being a fuse request, the uid and gid must be valid
2649 * in the caller's namespace. So we can just check to make
2650 * sure that the caller is root in his uid, and privileged
2651 * over the file's current owner.
2652 */
2653 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2654 ret = -EACCES;
2655 goto out;
2656 }
2657
2658 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2659
2660out:
2661 free_key(k);
2662 free(cgdir);
2663
2664 return ret;
2665}
2666
2667int cg_chmod(const char *path, mode_t mode)
2668{
2669 struct fuse_context *fc = fuse_get_context();
2670 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2671 struct cgfs_files *k = NULL;
2672 const char *cgroup;
2673 int ret;
2674
2675 if (!fc)
2676 return -EIO;
2677
2678 if (strcmp(path, "/cgroup") == 0)
2679 return -EINVAL;
2680
2681 controller = pick_controller_from_path(fc, path);
2682 if (!controller)
2683 return -EINVAL;
2684 cgroup = find_cgroup_in_path(path);
2685 if (!cgroup)
2686 /* this is just /cgroup/controller */
2687 return -EINVAL;
2688
2689 get_cgdir_and_path(cgroup, &cgdir, &last);
2690
2691 if (!last) {
2692 path1 = "/";
2693 path2 = cgdir;
2694 } else {
2695 path1 = cgdir;
2696 path2 = last;
2697 }
2698
2699 if (is_child_cgroup(controller, path1, path2)) {
2700 // get uid, gid, from '/tasks' file and make up a mode
2701 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2702 k = cgfs_get_key(controller, cgroup, "tasks");
2703
2704 } else
2705 k = cgfs_get_key(controller, path1, path2);
2706
2707 if (!k) {
2708 ret = -EINVAL;
2709 goto out;
2710 }
2711
2712 /*
2713 * This being a fuse request, the uid and gid must be valid
2714 * in the caller's namespace. So we can just check to make
2715 * sure that the caller is root in his uid, and privileged
2716 * over the file's current owner.
2717 */
2718 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2719 ret = -EPERM;
2720 goto out;
2721 }
2722
2723 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2724 ret = -EINVAL;
2725 goto out;
2726 }
2727
2728 ret = 0;
2729out:
2730 free_key(k);
2731 free(cgdir);
2732 return ret;
2733}
2734
2735int cg_mkdir(const char *path, mode_t mode)
2736{
2737 struct fuse_context *fc = fuse_get_context();
2738 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2739 const char *cgroup;
2740 int ret;
2741
2742 if (!fc)
2743 return -EIO;
2744
2745
2746 controller = pick_controller_from_path(fc, path);
2747 if (!controller)
2748 return -EINVAL;
2749
2750 cgroup = find_cgroup_in_path(path);
2751 if (!cgroup)
2752 return -EINVAL;
2753
2754 get_cgdir_and_path(cgroup, &cgdir, &last);
2755 if (!last)
2756 path1 = "/";
2757 else
2758 path1 = cgdir;
2759
2760 pid_t initpid = lookup_initpid_in_store(fc->pid);
2761 if (initpid <= 0)
2762 initpid = fc->pid;
2763 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2764 if (!next)
2765 ret = -EINVAL;
2766 else if (last && strcmp(next, last) == 0)
2767 ret = -EEXIST;
2768 else
2769 ret = -ENOENT;
2770 goto out;
2771 }
2772
2773 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2774 ret = -EACCES;
2775 goto out;
2776 }
2777 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2778 ret = -EACCES;
2779 goto out;
2780 }
2781
2782 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2783
2784out:
2785 free(cgdir);
2786 free(next);
2787 return ret;
2788}
2789
2790int cg_rmdir(const char *path)
2791{
2792 struct fuse_context *fc = fuse_get_context();
2793 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2794 const char *cgroup;
2795 int ret;
2796
2797 if (!fc)
2798 return -EIO;
2799
2800 controller = pick_controller_from_path(fc, path);
2801 if (!controller)
2802 return -EINVAL;
2803
2804 cgroup = find_cgroup_in_path(path);
2805 if (!cgroup)
2806 return -EINVAL;
2807
2808 get_cgdir_and_path(cgroup, &cgdir, &last);
2809 if (!last) {
2810 ret = -EINVAL;
2811 goto out;
2812 }
2813
2814 pid_t initpid = lookup_initpid_in_store(fc->pid);
2815 if (initpid <= 0)
2816 initpid = fc->pid;
2817 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2818 if (!last || strcmp(next, last) == 0)
2819 ret = -EBUSY;
2820 else
2821 ret = -ENOENT;
2822 goto out;
2823 }
2824
2825 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2826 ret = -EACCES;
2827 goto out;
2828 }
2829 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2830 ret = -EACCES;
2831 goto out;
2832 }
2833
2834 if (!cgfs_remove(controller, cgroup)) {
2835 ret = -EINVAL;
2836 goto out;
2837 }
2838
2839 ret = 0;
2840
2841out:
2842 free(cgdir);
2843 free(next);
2844 return ret;
2845}
2846
/* Return true if @line begins with the string @pref (an empty @pref always matches). */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2853
/*
 * Find the first line of @memstat that begins with "total_cache" and
 * store the number following it, divided by 1024, in @v.
 * @v is set to 0 when no such line exists or no number can be parsed.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
2871
/*
 * Look in @str for the first line that begins with
 * "<major>:<minor> <iotype>" and store the number following that
 * prefix in @v.  @v is set to 0 when no line matches or no number can
 * be parsed.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
2894
2895static int read_file(const char *path, char *buf, size_t size,
2896 struct file_info *d)
2897{
2898 size_t linelen = 0, total_len = 0, rv = 0;
2899 char *line = NULL;
2900 char *cache = d->buf;
2901 size_t cache_size = d->buflen;
2902 FILE *f = fopen(path, "r");
2903 if (!f)
2904 return 0;
2905
2906 while (getline(&line, &linelen, f) != -1) {
a262ddb7 2907 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
2908 if (l < 0) {
2909 perror("Error writing to cache");
2910 rv = 0;
2911 goto err;
2912 }
2913 if (l >= cache_size) {
2914 fprintf(stderr, "Internal error: truncated write to cache\n");
2915 rv = 0;
2916 goto err;
2917 }
2918 cache += l;
2919 cache_size -= l;
2920 total_len += l;
2921 }
2922
2923 d->size = total_len;
a262ddb7
CB
2924 if (total_len > size)
2925 total_len = size;
237e200e
SH
2926
2927 /* read from off 0 */
2928 memcpy(buf, d->buf, total_len);
2929 rv = total_len;
2930 err:
2931 fclose(f);
2932 free(line);
2933 return rv;
2934}
2935
2936/*
2937 * FUSE ops for /proc
2938 */
2939
/*
 * Return memory.limit_in_bytes of @cgroup parsed as a base-10 number,
 * or (unsigned long)-1 if the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *raw = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);

	free(raw);
	return limit;
}
2952
/*
 * Return the smallest memory limit that applies to @cgroup: its own
 * get_memlimit() value lowered by any smaller limit found on a parent
 * directory, walking up with dirname() until "/" is reached.  Parents
 * whose limit reads as -1 are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *dir = strdupa(cgroup);
	unsigned long lowest = get_memlimit(dir);

	while (strcmp(dir, "/") != 0) {
		unsigned long here;

		dir = dirname(dir);
		here = get_memlimit(dir);
		if (here != -1 && here < lowest)
			lowest = here;
	}

	return lowest;
}
2969
2970static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2971 struct fuse_file_info *fi)
2972{
2973 struct fuse_context *fc = fuse_get_context();
2974 struct file_info *d = (struct file_info *)fi->fh;
2975 char *cg;
2976 char *memusage_str = NULL, *memstat_str = NULL,
2977 *memswlimit_str = NULL, *memswusage_str = NULL,
2978 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2979 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2980 cached = 0, hosttotal = 0;
2981 char *line = NULL;
2982 size_t linelen = 0, total_len = 0, rv = 0;
2983 char *cache = d->buf;
2984 size_t cache_size = d->buflen;
2985 FILE *f = NULL;
2986
2987 if (offset){
2988 if (offset > d->size)
2989 return -EINVAL;
2990 if (!d->cached)
2991 return 0;
2992 int left = d->size - offset;
2993 total_len = left > size ? size: left;
2994 memcpy(buf, cache + offset, total_len);
2995 return total_len;
2996 }
2997
2998 pid_t initpid = lookup_initpid_in_store(fc->pid);
2999 if (initpid <= 0)
3000 initpid = fc->pid;
3001 cg = get_pid_cgroup(initpid, "memory");
3002 if (!cg)
3003 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3004 prune_init_slice(cg);
237e200e
SH
3005
3006 memlimit = get_min_memlimit(cg);
3007 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3008 goto err;
3009 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3010 goto err;
3011
3012 // Following values are allowed to fail, because swapaccount might be turned
3013 // off for current kernel
3014 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3015 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3016 {
3017 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3018 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3019 goto err;
3020 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3021 goto err;
3022
3023 memswlimit = strtoul(memswlimit_str, NULL, 10);
3024 memswusage = strtoul(memswusage_str, NULL, 10);
3025
3026 if (!strcmp(memswlimit_str, memswlimit_default_str))
3027 memswlimit = 0;
3028 if (!strcmp(memswusage_str, memswusage_default_str))
3029 memswusage = 0;
3030
3031 memswlimit = memswlimit / 1024;
3032 memswusage = memswusage / 1024;
3033 }
3034
3035 memusage = strtoul(memusage_str, NULL, 10);
3036 memlimit /= 1024;
3037 memusage /= 1024;
3038
3039 get_mem_cached(memstat_str, &cached);
3040
3041 f = fopen("/proc/meminfo", "r");
3042 if (!f)
3043 goto err;
3044
3045 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3046 ssize_t l;
237e200e
SH
3047 char *printme, lbuf[100];
3048
3049 memset(lbuf, 0, 100);
3050 if (startswith(line, "MemTotal:")) {
3051 sscanf(line+14, "%lu", &hosttotal);
3052 if (hosttotal < memlimit)
3053 memlimit = hosttotal;
3054 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3055 printme = lbuf;
3056 } else if (startswith(line, "MemFree:")) {
3057 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3058 printme = lbuf;
3059 } else if (startswith(line, "MemAvailable:")) {
3060 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
3061 printme = lbuf;
3062 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3063 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
3064 printme = lbuf;
3065 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
f676eb79 3066 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
237e200e
SH
3067 (memswlimit - memlimit) - (memswusage - memusage));
3068 printme = lbuf;
da35d72a
SH
3069 } else if (startswith(line, "Slab:")) {
3070 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3071 printme = lbuf;
237e200e
SH
3072 } else if (startswith(line, "Buffers:")) {
3073 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3074 printme = lbuf;
3075 } else if (startswith(line, "Cached:")) {
3076 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3077 printme = lbuf;
3078 } else if (startswith(line, "SwapCached:")) {
3079 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3080 printme = lbuf;
3081 } else
3082 printme = line;
3083
3084 l = snprintf(cache, cache_size, "%s", printme);
3085 if (l < 0) {
3086 perror("Error writing to cache");
3087 rv = 0;
3088 goto err;
3089
3090 }
3091 if (l >= cache_size) {
3092 fprintf(stderr, "Internal error: truncated write to cache\n");
3093 rv = 0;
3094 goto err;
3095 }
3096
3097 cache += l;
3098 cache_size -= l;
3099 total_len += l;
3100 }
3101
3102 d->cached = 1;
3103 d->size = total_len;
3104 if (total_len > size ) total_len = size;
3105 memcpy(buf, d->buf, total_len);
3106
3107 rv = total_len;
3108err:
3109 if (f)
3110 fclose(f);
3111 free(line);
3112 free(cg);
3113 free(memusage_str);
3114 free(memswlimit_str);
3115 free(memswusage_str);
3116 free(memstat_str);
3117 free(memswlimit_default_str);
3118 free(memswusage_default_str);
3119 return rv;
3120}
3121
/*
 * Read cpuset.cpus for cgroup @cg.
 * Returns the string obtained from cgfs_get_value(), which the caller
 * must free, or NULL if the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3134
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu N passes
 * cpu_in_cpuset() for @cpuset; false for any other line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3145
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3157
3158static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3159 struct fuse_file_info *fi)
3160{
3161 struct fuse_context *fc = fuse_get_context();
3162 struct file_info *d = (struct file_info *)fi->fh;
3163 char *cg;
3164 char *cpuset = NULL;
3165 char *line = NULL;
3166 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3167 bool am_printing = false, firstline = true, is_s390x = false;
3168 int curcpu = -1, cpu;
237e200e
SH
3169 char *cache = d->buf;
3170 size_t cache_size = d->buflen;
3171 FILE *f = NULL;
3172
3173 if (offset){
3174 if (offset > d->size)
3175 return -EINVAL;
3176 if (!d->cached)
3177 return 0;
3178 int left = d->size - offset;
3179 total_len = left > size ? size: left;
3180 memcpy(buf, cache + offset, total_len);
3181 return total_len;
3182 }
3183
3184 pid_t initpid = lookup_initpid_in_store(fc->pid);
3185 if (initpid <= 0)
3186 initpid = fc->pid;
3187 cg = get_pid_cgroup(initpid, "cpuset");
3188 if (!cg)
3189 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3190 prune_init_slice(cg);
237e200e
SH
3191
3192 cpuset = get_cpuset(cg);
3193 if (!cpuset)
3194 goto err;
3195
3196 f = fopen("/proc/cpuinfo", "r");
3197 if (!f)
3198 goto err;
3199
3200 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3201 ssize_t l;
f676eb79
SH
3202 if (firstline) {
3203 firstline = false;
3204 if (strstr(line, "IBM/S390") != NULL) {
3205 is_s390x = true;
3206 am_printing = true;
5ed9d4e2 3207 continue;
f676eb79
SH
3208 }
3209 }
5ed9d4e2
SH
3210 if (strncmp(line, "# processors:", 12) == 0)
3211 continue;
237e200e
SH
3212 if (is_processor_line(line)) {
3213 am_printing = cpuline_in_cpuset(line, cpuset);
3214 if (am_printing) {
3215 curcpu ++;
3216 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3217 if (l < 0) {
3218 perror("Error writing to cache");
3219 rv = 0;
3220 goto err;
3221 }
3222 if (l >= cache_size) {
3223 fprintf(stderr, "Internal error: truncated write to cache\n");
3224 rv = 0;
3225 goto err;
3226 }
3227 cache += l;
3228 cache_size -= l;
3229 total_len += l;
3230 }
3231 continue;
f676eb79
SH
3232 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3233 char *p;
3234 if (!cpu_in_cpuset(cpu, cpuset))
3235 continue;
3236 curcpu ++;
3237 p = strchr(line, ':');
3238 if (!p || !*p)
3239 goto err;
3240 p++;
5ed9d4e2 3241 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3242 if (l < 0) {
3243 perror("Error writing to cache");
3244 rv = 0;
3245 goto err;
3246 }
3247 if (l >= cache_size) {
3248 fprintf(stderr, "Internal error: truncated write to cache\n");
3249 rv = 0;
3250 goto err;
3251 }
3252 cache += l;
3253 cache_size -= l;
3254 total_len += l;
3255 continue;
3256
237e200e
SH
3257 }
3258 if (am_printing) {
3259 l = snprintf(cache, cache_size, "%s", line);
3260 if (l < 0) {
3261 perror("Error writing to cache");
3262 rv = 0;
3263 goto err;
3264 }
3265 if (l >= cache_size) {
3266 fprintf(stderr, "Internal error: truncated write to cache\n");
3267 rv = 0;
3268 goto err;
3269 }
3270 cache += l;
3271 cache_size -= l;
3272 total_len += l;
3273 }
3274 }
3275
5ed9d4e2
SH
3276 if (is_s390x) {
3277 char *origcache = d->buf;
a262ddb7 3278 ssize_t l;
5ed9d4e2
SH
3279 do {
3280 d->buf = malloc(d->buflen);
3281 } while (!d->buf);
3282 cache = d->buf;
3283 cache_size = d->buflen;
3284 total_len = 0;
3285 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3286 if (l < 0 || l >= cache_size) {
3287 free(origcache);
3288 goto err;
3289 }
3290 cache_size -= l;
3291 cache += l;
3292 total_len += l;
3293 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3294 if (l < 0 || l >= cache_size) {
3295 free(origcache);
3296 goto err;
3297 }
3298 cache_size -= l;
3299 cache += l;
3300 total_len += l;
3301 l = snprintf(cache, cache_size, "%s", origcache);
3302 free(origcache);
3303 if (l < 0 || l >= cache_size)
3304 goto err;
3305 total_len += l;
3306 }
3307
237e200e
SH
3308 d->cached = 1;
3309 d->size = total_len;
3310 if (total_len > size ) total_len = size;
3311
3312 /* read from off 0 */
3313 memcpy(buf, d->buf, total_len);
3314 rv = total_len;
3315err:
3316 if (f)
3317 fclose(f);
3318 free(line);
3319 free(cpuset);
3320 free(cg);
3321 return rv;
3322}
3323
3324static int proc_stat_read(char *buf, size_t size, off_t offset,
3325 struct fuse_file_info *fi)
3326{
3327 struct fuse_context *fc = fuse_get_context();
3328 struct file_info *d = (struct file_info *)fi->fh;
3329 char *cg;
3330 char *cpuset = NULL;
3331 char *line = NULL;
3332 size_t linelen = 0, total_len = 0, rv = 0;
3333 int curcpu = -1; /* cpu numbering starts at 0 */
3334 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3335 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3336 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3337#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3338 char cpuall[CPUALL_MAX_SIZE];
3339 /* reserve for cpu all */
3340 char *cache = d->buf + CPUALL_MAX_SIZE;
3341 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3342 FILE *f = NULL;
3343
3344 if (offset){
3345 if (offset > d->size)
3346 return -EINVAL;
3347 if (!d->cached)
3348 return 0;
3349 int left = d->size - offset;
3350 total_len = left > size ? size: left;
3351 memcpy(buf, d->buf + offset, total_len);
3352 return total_len;
3353 }
3354
3355 pid_t initpid = lookup_initpid_in_store(fc->pid);
3356 if (initpid <= 0)
3357 initpid = fc->pid;
3358 cg = get_pid_cgroup(initpid, "cpuset");
3359 if (!cg)
3360 return read_file("/proc/stat", buf, size, d);
6d2f6996 3361 prune_init_slice(cg);
237e200e
SH
3362
3363 cpuset = get_cpuset(cg);
3364 if (!cpuset)
3365 goto err;
3366
3367 f = fopen("/proc/stat", "r");
3368 if (!f)
3369 goto err;
3370
3371 //skip first line
3372 if (getline(&line, &linelen, f) < 0) {
3373 fprintf(stderr, "proc_stat_read read first line failed\n");
3374 goto err;
3375 }
3376
3377 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3378 ssize_t l;
237e200e
SH
3379 int cpu;
3380 char cpu_char[10]; /* That's a lot of cores */
3381 char *c;
3382
3383 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3384 /* not a ^cpuN line containing a number N, just print it */
3385 l = snprintf(cache, cache_size, "%s", line);
3386 if (l < 0) {
3387 perror("Error writing to cache");
3388 rv = 0;
3389 goto err;
3390 }
3391 if (l >= cache_size) {
3392 fprintf(stderr, "Internal error: truncated write to cache\n");
3393 rv = 0;
3394 goto err;
3395 }
3396 cache += l;
3397 cache_size -= l;
3398 total_len += l;
3399 continue;
3400 }
3401
3402 if (sscanf(cpu_char, "%d", &cpu) != 1)
3403 continue;
3404 if (!cpu_in_cpuset(cpu, cpuset))
3405 continue;
3406 curcpu ++;
3407
3408 c = strchr(line, ' ');
3409 if (!c)
3410 continue;
3411 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3412 if (l < 0) {
3413 perror("Error writing to cache");
3414 rv = 0;
3415 goto err;
3416
3417 }
3418 if (l >= cache_size) {
3419 fprintf(stderr, "Internal error: truncated write to cache\n");
3420 rv = 0;
3421 goto err;
3422 }
3423
3424 cache += l;
3425 cache_size -= l;
3426 total_len += l;
3427
3428 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3429 &softirq, &steal, &guest) != 9)
3430 continue;
3431 user_sum += user;
3432 nice_sum += nice;
3433 system_sum += system;
3434 idle_sum += idle;
3435 iowait_sum += iowait;
3436 irq_sum += irq;
3437 softirq_sum += softirq;
3438 steal_sum += steal;
3439 guest_sum += guest;
3440 }
3441
3442 cache = d->buf;
3443
3444 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3445 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3446 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3447 memcpy(cache, cpuall, cpuall_len);
3448 cache += cpuall_len;
3449 } else{
3450 /* shouldn't happen */
3451 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3452 cpuall_len = 0;
3453 }
3454
3455 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3456 total_len += cpuall_len;
3457 d->cached = 1;
3458 d->size = total_len;
3459 if (total_len > size ) total_len = size;
3460
3461 memcpy(buf, d->buf, total_len);
3462 rv = total_len;
3463
3464err:
3465 if (f)
3466 fclose(f);
3467 free(line);
3468 free(cpuset);
3469 free(cg);
3470 return rv;
3471}
3472
/*
 * Return the number of seconds between the st_ctime of /proc/<initpid>
 * and now, where <initpid> is what lookup_initpid_in_store() returns
 * for @pid.  Returns 0 if no init pid is found, the path does not fit
 * the buffer or lstat() fails.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3493
/*
 * Return cpuacct.usage of the cpuacct cgroup of @task's init pid,
 * divided by 1000000000 (presumably nanoseconds to seconds -- confirm
 * against the cpuacct controller documentation).  Returns 0 if the init
 * pid, its cgroup or the usage value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;
	pid_t initpid = lookup_initpid_in_store(task);

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup) {
		prune_init_slice(cgroup);
		if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
			usage = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(cgroup);
	free(usage_str);
	return usage;
}
3517
#if RELOADTEST
/*
 * Reload-test marker: create an empty file named "iwashere" (mode 0755)
 * in the current working directory.  Exits with status 1 if the working
 * directory cannot be determined; a failing creat() is ignored.
 */
void iwashere(void)
{
	char *cwd = get_current_dir_name();
	char *name;
	size_t len;
	int fd;

	if (cwd == NULL)
		exit(1);

	len = strlen(cwd) + strlen("/iwashere") + 1;
	name = alloca(len);
	snprintf(name, len, "%s/iwashere", cwd);
	free(cwd);

	fd = creat(name, 0755);
	if (fd < 0)
		return;
	close(fd);
}
#endif
3536
3537/*
3538 * We read /proc/uptime and reuse its second field.
3539 * For the first field, we use the mtime for the reaper for
3540 * the calling pid as returned by getreaperage
3541 */
3542static int proc_uptime_read(char *buf, size_t size, off_t offset,
3543 struct fuse_file_info *fi)
3544{
3545 struct fuse_context *fc = fuse_get_context();
3546 struct file_info *d = (struct file_info *)fi->fh;
3547 long int reaperage = getreaperage(fc->pid);
3548 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3549 char *cache = d->buf;
a262ddb7 3550 ssize_t total_len = 0;
237e200e
SH
3551
3552#if RELOADTEST
3553 iwashere();
3554#endif
3555
3556 if (offset){
3557 if (offset > d->size)
3558 return -EINVAL;
3559 if (!d->cached)
3560 return 0;
3561 int left = d->size - offset;
3562 total_len = left > size ? size: left;
3563 memcpy(buf, cache + offset, total_len);
3564 return total_len;
3565 }
3566
3567 idletime = reaperage - busytime;
3568 if (idletime > reaperage)
3569 idletime = reaperage;
3570
3571 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3572 if (total_len < 0){
3573 perror("Error writing to cache");
3574 return 0;
3575 }
3576
3577 d->size = (int)total_len;
3578 d->cached = 1;
3579
3580 if (total_len > size) total_len = size;
3581
3582 memcpy(buf, d->buf, total_len);
3583 return total_len;
3584}
3585
3586static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3587 struct fuse_file_info *fi)
3588{
3589 char dev_name[72];
3590 struct fuse_context *fc = fuse_get_context();
3591 struct file_info *d = (struct file_info *)fi->fh;
3592 char *cg;
3593 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3594 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3595 unsigned long read = 0, write = 0;
3596 unsigned long read_merged = 0, write_merged = 0;
3597 unsigned long read_sectors = 0, write_sectors = 0;
3598 unsigned long read_ticks = 0, write_ticks = 0;
3599 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3600 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3601 char *cache = d->buf;
3602 size_t cache_size = d->buflen;
3603 char *line = NULL;
3604 size_t linelen = 0, total_len = 0, rv = 0;
3605 unsigned int major = 0, minor = 0;
3606 int i = 0;
3607 FILE *f = NULL;
3608
3609 if (offset){
3610 if (offset > d->size)
3611 return -EINVAL;
3612 if (!d->cached)
3613 return 0;
3614 int left = d->size - offset;
3615 total_len = left > size ? size: left;
3616 memcpy(buf, cache + offset, total_len);
3617 return total_len;
3618 }
3619
3620 pid_t initpid = lookup_initpid_in_store(fc->pid);
3621 if (initpid <= 0)
3622 initpid = fc->pid;
3623 cg = get_pid_cgroup(initpid, "blkio");
3624 if (!cg)
3625 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 3626 prune_init_slice(cg);
237e200e 3627
2209fe50 3628 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 3629 goto err;
2209fe50 3630 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 3631 goto err;
2209fe50 3632 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 3633 goto err;
2209fe50 3634 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 3635 goto err;
2209fe50 3636 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
3637 goto err;
3638
3639
3640 f = fopen("/proc/diskstats", "r");
3641 if (!f)
3642 goto err;
3643
3644 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3645 ssize_t l;
2209fe50 3646 char lbuf[256];
237e200e
SH
3647
3648 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 3649 if (i != 3)
237e200e 3650 continue;
2209fe50
SH
3651
3652 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3653 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3654 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3655 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3656 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3657 read_sectors = read_sectors/512;
3658 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3659 write_sectors = write_sectors/512;
3660
3661 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3662 rd_svctm = rd_svctm/1000000;
3663 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3664 rd_wait = rd_wait/1000000;
3665 read_ticks = rd_svctm + rd_wait;
3666
3667 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3668 wr_svctm = wr_svctm/1000000;
3669 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3670 wr_wait = wr_wait/1000000;
3671 write_ticks = wr_svctm + wr_wait;
3672
3673 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3674 tot_ticks = tot_ticks/1000000;
237e200e
SH
3675
3676 memset(lbuf, 0, 256);
2db31eb6
SH
3677 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3678 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3679 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3680 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3681 else
3682 continue;
237e200e 3683
2209fe50 3684 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
3685 if (l < 0) {
3686 perror("Error writing to fuse buf");
3687 rv = 0;
3688 goto err;
3689 }
3690 if (l >= cache_size) {
3691 fprintf(stderr, "Internal error: truncated write to cache\n");
3692 rv = 0;
3693 goto err;
3694 }
3695 cache += l;
3696 cache_size -= l;
3697 total_len += l;
3698 }
3699
3700 d->cached = 1;
3701 d->size = total_len;
3702 if (total_len > size ) total_len = size;
3703 memcpy(buf, d->buf, total_len);
3704
3705 rv = total_len;
3706err:
3707 free(cg);
3708 if (f)
3709 fclose(f);
3710 free(line);
3711 free(io_serviced_str);
3712 free(io_merged_str);
3713 free(io_service_bytes_str);
3714 free(io_wait_time_str);
3715 free(io_service_time_str);
3716 return rv;
3717}
3718
70dcc12e
SH
3719static int proc_swaps_read(char *buf, size_t size, off_t offset,
3720 struct fuse_file_info *fi)
3721{
3722 struct fuse_context *fc = fuse_get_context();
3723 struct file_info *d = (struct file_info *)fi->fh;
3724 char *cg = NULL;
3725 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3726 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3727 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
3728 ssize_t total_len = 0, rv = 0;
3729 ssize_t l = 0;
70dcc12e
SH
3730 char *cache = d->buf;
3731
3732 if (offset) {
3733 if (offset > d->size)
3734 return -EINVAL;
3735 if (!d->cached)
3736 return 0;
3737 int left = d->size - offset;
3738 total_len = left > size ? size: left;
3739 memcpy(buf, cache + offset, total_len);
3740 return total_len;
3741 }
3742
3743 pid_t initpid = lookup_initpid_in_store(fc->pid);
3744 if (initpid <= 0)
3745 initpid = fc->pid;
3746 cg = get_pid_cgroup(initpid, "memory");
3747 if (!cg)
3748 return read_file("/proc/swaps", buf, size, d);
6d2f6996 3749 prune_init_slice(cg);
70dcc12e
SH
3750
3751 if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3752 goto err;
3753
3754 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3755 goto err;
3756
3757 memlimit = strtoul(memlimit_str, NULL, 10);
3758 memusage = strtoul(memusage_str, NULL, 10);
3759
3760 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3761 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3762
3763 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3764 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3765 goto err;
3766 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3767 goto err;
3768
3769 memswlimit = strtoul(memswlimit_str, NULL, 10);
3770 memswusage = strtoul(memswusage_str, NULL, 10);
3771
3772 if (!strcmp(memswlimit_str, memswlimit_default_str))
3773 memswlimit = 0;
3774 if (!strcmp(memswusage_str, memswusage_default_str))
3775 memswusage = 0;
3776
3777 swap_total = (memswlimit - memlimit) / 1024;
3778 swap_free = (memswusage - memusage) / 1024;
3779 }
3780
3781 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3782
3783 /* When no mem + swap limit is specified or swapaccount=0*/
3784 if (!memswlimit) {
3785 char *line = NULL;
3786 size_t linelen = 0;
3787 FILE *f = fopen("/proc/meminfo", "r");
3788
3789 if (!f)
3790 goto err;
3791
3792 while (getline(&line, &linelen, f) != -1) {
3793 if (startswith(line, "SwapTotal:")) {
3794 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
3795 } else if (startswith(line, "SwapFree:")) {
3796 sscanf(line, "SwapFree: %8lu kB", &swap_free);
3797 }
3798 }
3799
3800 free(line);
3801 fclose(f);
3802 }
3803
3804 if (swap_total > 0) {
a262ddb7
CB
3805 l = snprintf(d->buf + total_len, d->size - total_len,
3806 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3807 swap_total, swap_free);
3808 total_len += l;
70dcc12e
SH
3809 }
3810
a262ddb7 3811 if (total_len < 0 || l < 0) {
70dcc12e
SH
3812 perror("Error writing to cache");
3813 rv = 0;
3814 goto err;
3815 }
3816
3817 d->cached = 1;
3818 d->size = (int)total_len;
3819
3820 if (total_len > size) total_len = size;
3821 memcpy(buf, d->buf, total_len);
3822 rv = total_len;
3823
3824err:
3825 free(cg);
3826 free(memswlimit_str);
3827 free(memlimit_str);
3828 free(memusage_str);
3829 free(memswusage_str);
3830 free(memswusage_default_str);
3831 free(memswlimit_default_str);
3832 return rv;
3833}
3834
237e200e
SH
/*
 * get_procfile_size - count the bytes in @which by reading it line by line.
 *
 * Returns the sum of the lengths reported by getline() for every line, or
 * 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *scratch = NULL;
	size_t scratch_cap = 0;
	ssize_t total = 0;
	ssize_t nread;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&scratch, &scratch_cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	free(scratch);
	fclose(fp);

	return total;
}
3851
/*
 * proc_getattr - fill in stat data for the virtual /proc tree.
 *
 * "/proc" is reported as a read/search-only directory with two links; each
 * of the emulated files is reported as a read-only regular file of size 0.
 * Ownership is uid/gid 0 and all three timestamps are the current
 * CLOCK_REALTIME time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
	};
	struct timespec stamp;
	size_t idx;

	memset(sb, 0, sizeof(*sb));

	if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = stamp;
	sb->st_mtim = stamp;
	sb->st_ctim = stamp;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
3880
3881int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3882 struct fuse_file_info *fi)
3883{
3884 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
3885 filler(buf, "meminfo", NULL, 0) != 0 ||
3886 filler(buf, "stat", NULL, 0) != 0 ||
3887 filler(buf, "uptime", NULL, 0) != 0 ||
70dcc12e
SH
3888 filler(buf, "diskstats", NULL, 0) != 0 ||
3889 filler(buf, "swaps", NULL, 0) != 0)
237e200e
SH
3890 return -EINVAL;
3891 return 0;
3892}
3893
3894int proc_open(const char *path, struct fuse_file_info *fi)
3895{
3896 int type = -1;
3897 struct file_info *info;
3898
3899 if (strcmp(path, "/proc/meminfo") == 0)
3900 type = LXC_TYPE_PROC_MEMINFO;
3901 else if (strcmp(path, "/proc/cpuinfo") == 0)
3902 type = LXC_TYPE_PROC_CPUINFO;
3903 else if (strcmp(path, "/proc/uptime") == 0)
3904 type = LXC_TYPE_PROC_UPTIME;
3905 else if (strcmp(path, "/proc/stat") == 0)
3906 type = LXC_TYPE_PROC_STAT;
3907 else if (strcmp(path, "/proc/diskstats") == 0)
3908 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
3909 else if (strcmp(path, "/proc/swaps") == 0)
3910 type = LXC_TYPE_PROC_SWAPS;
237e200e
SH
3911 if (type == -1)
3912 return -ENOENT;
3913
3914 info = malloc(sizeof(*info));
3915 if (!info)
3916 return -ENOMEM;
3917
3918 memset(info, 0, sizeof(*info));
3919 info->type = type;
3920
3921 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3922 do {
3923 info->buf = malloc(info->buflen);
3924 } while (!info->buf);
3925 memset(info->buf, 0, info->buflen);
3926 /* set actual size to buffer size */
3927 info->size = info->buflen;
3928
3929 fi->fh = (unsigned long)info;
3930 return 0;
3931}
3932
bddbb106
SH
/*
 * proc_access - access(2) check for the emulated /proc files.
 *
 * Every file is read-only, so a request containing any bit other than R_OK
 * is refused.  @path is not examined.
 *
 * Returns 0 when access is allowed, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	const int not_read_bits = mask & ~R_OK;

	(void)path;

	/* these are all read-only */
	return not_read_bits ? -EACCES : 0;
}
3940
237e200e
SH
/*
 * proc_release - release handler for the emulated /proc files.
 *
 * Hands @fi to do_release_file_info() and always reports success.
 * @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
3946
3947int proc_read(const char *path, char *buf, size_t size, off_t offset,
3948 struct fuse_file_info *fi)
3949{
3950 struct file_info *f = (struct file_info *) fi->fh;
3951
3952 switch (f->type) {
3953 case LXC_TYPE_PROC_MEMINFO:
3954 return proc_meminfo_read(buf, size, offset, fi);
3955 case LXC_TYPE_PROC_CPUINFO:
3956 return proc_cpuinfo_read(buf, size, offset, fi);
3957 case LXC_TYPE_PROC_UPTIME:
3958 return proc_uptime_read(buf, size, offset, fi);
3959 case LXC_TYPE_PROC_STAT:
3960 return proc_stat_read(buf, size, offset, fi);
3961 case LXC_TYPE_PROC_DISKSTATS:
3962 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
3963 case LXC_TYPE_PROC_SWAPS:
3964 return proc_swaps_read(buf, size, offset, fi);
237e200e
SH
3965 default:
3966 return -EINVAL;
3967 }
3968}
3969
/*
 * collect_subsystems - constructor that records the cgroup hierarchies
 * listed in /proc/self/cgroup.
 *
 * Each line is split at its first and its last ':'; the first field and the
 * field between the two separators are passed to store_hierarchy().
 * Parsing stops at the first malformed line or store_hierarchy() failure,
 * in which case print_subsystems() is not called.
 */
static void __attribute__((constructor)) collect_subsystems(void)
{
	char *entry = NULL;
	size_t entry_cap = 0;
	bool complete = true;
	FILE *fp = fopen("/proc/self/cgroup", "r");

	if (fp == NULL) {
		fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (getline(&entry, &entry_cap, fp) != -1) {
		char *controllers;
		char *last_sep;

		/* terminate the first field and step past the separator */
		controllers = strchr(entry, ':');
		if (controllers == NULL) {
			complete = false;
			break;
		}
		*controllers = '\0';
		controllers++;

		/* cut the line at the last separator, dropping the final field */
		last_sep = strrchr(controllers, ':');
		if (last_sep == NULL) {
			complete = false;
			break;
		}
		*last_sep = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (controllers[0] == '\0')
			continue;

		if (!store_hierarchy(entry, controllers)) {
			complete = false;
			break;
		}
	}

	if (complete)
		print_subsystems();

	free(entry);
	fclose(fp);
}
4011
4012static void __attribute__((destructor)) free_subsystems(void)
4013{
4014 int i;
4015
4016 for (i = 0; i < num_hierarchies; i++)
4017 if (hierarchies[i])
4018 free(hierarchies[i]);
4019 free(hierarchies);
4020}