]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
bindings: cgfs_chmod_file()
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
11#include <stdio.h>
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
22#include <sched.h>
23#include <pthread.h>
24#include <linux/sched.h>
25#include <sys/param.h>
26#include <sys/socket.h>
27#include <sys/mount.h>
28#include <sys/epoll.h>
29#include <wait.h>
30
237e200e
SH
31#include "bindings.h"
32
33#include "config.h" // for VERSION
34
/*
 * Type tags for the entries handled in this file: cgroup directories and
 * files, plus one tag per emulated /proc file.
 */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
};
45
/* Per-file bookkeeping record. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* presumably one of the LXC_TYPE_* tags - confirm at the users */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
56
57/* reserve buffer size, for cpuall in /proc/stat */
58#define BUF_RESERVE_SIZE 256
59
60/*
61 * A table caching which pid is init for a pid namespace.
62 * When looking up which pid is init for $qpid, we first
63 * 1. Stat /proc/$qpid/ns/pid.
64 * 2. Check whether the ino_t is in our store.
65 * a. if not, fork a child in qpid's ns to send us
66 * ucred.pid = 1, and read the initpid. Cache
67 * initpid and creation time for /proc/initpid
68 * in a new store entry.
69 * b. if so, verify that /proc/initpid still matches
70 * what we have saved. If not, clear the store
71 * entry and go back to a. If so, return the
72 * cached initpid.
73 */
/* One cache entry: maps a pid namespace inode to the pid of its init. */
struct pidns_init_store {
	ino_t ino;			/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	long int lastcheck;		/* time of the last save or successful lookup */
};
81
82/* lol - look at how they are allocated in the kernel */
83#define PIDNS_HASH_SIZE 4096
84#define HASH(x) ((x) % PIDNS_HASH_SIZE)
85
86static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
87static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l.  A locking failure is fatal: the error is printed to stderr
 * and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
97
/*
 * Unlock @l.  An unlocking failure is fatal: the error is printed to
 * stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
107
108static void store_lock(void)
109{
110 lock_mutex(&pidns_store_mutex);
111}
112
113static void store_unlock(void)
114{
115 unlock_mutex(&pidns_store_mutex);
116}
117
118/* Must be called under store_lock */
119static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
120{
121 struct stat initsb;
122 char fnam[100];
123
124 snprintf(fnam, 100, "/proc/%d", e->initpid);
125 if (stat(fnam, &initsb) < 0)
126 return false;
127#if DEBUG
128 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
129 e->ctime, initsb.st_ctime, e->initpid);
130#endif
131 if (e->ctime != initsb.st_ctime)
132 return false;
133 return true;
134}
135
136/* Must be called under store_lock */
137static void remove_initpid(struct pidns_init_store *e)
138{
139 struct pidns_init_store *tmp;
140 int h;
141
142#if DEBUG
143 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
144#endif
145 h = HASH(e->ino);
146 if (pidns_hash_table[h] == e) {
147 pidns_hash_table[h] = e->next;
148 free(e);
149 return;
150 }
151
152 tmp = pidns_hash_table[h];
153 while (tmp) {
154 if (tmp->next == e) {
155 tmp->next = e->next;
156 free(e);
157 return;
158 }
159 tmp = tmp->next;
160 }
161}
162
163#define PURGE_SECS 5
164/* Must be called under store_lock */
165static void prune_initpid_store(void)
166{
167 static long int last_prune = 0;
168 struct pidns_init_store *e, *prev, *delme;
169 long int now, threshold;
170 int i;
171
172 if (!last_prune) {
173 last_prune = time(NULL);
174 return;
175 }
176 now = time(NULL);
177 if (now < last_prune + PURGE_SECS)
178 return;
179#if DEBUG
180 fprintf(stderr, "pruning\n");
181#endif
182 last_prune = now;
183 threshold = now - 2 * PURGE_SECS;
184
185 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
186 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
187 if (e->lastcheck < threshold) {
188#if DEBUG
189 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
190#endif
191 delme = e;
192 if (prev)
193 prev->next = e->next;
194 else
195 pidns_hash_table[i] = e->next;
196 e = e->next;
197 free(delme);
198 } else {
199 prev = e;
200 e = e->next;
201 }
202 }
203 }
204}
205
206/* Must be called under store_lock */
207static void save_initpid(struct stat *sb, pid_t pid)
208{
209 struct pidns_init_store *e;
210 char fpath[100];
211 struct stat procsb;
212 int h;
213
214#if DEBUG
215 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
216#endif
217 snprintf(fpath, 100, "/proc/%d", pid);
218 if (stat(fpath, &procsb) < 0)
219 return;
220 do {
221 e = malloc(sizeof(*e));
222 } while (!e);
223 e->ino = sb->st_ino;
224 e->initpid = pid;
225 e->ctime = procsb.st_ctime;
226 h = HASH(e->ino);
227 e->next = pidns_hash_table[h];
228 e->lastcheck = time(NULL);
229 pidns_hash_table[h] = e;
230}
231
232/*
233 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
234 * entry for the inode number and creation time. Verify that the init pid
235 * is still valid. If not, remove it. Return the entry if valid, NULL
236 * otherwise.
237 * Must be called under store_lock
238 */
239static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
240{
241 int h = HASH(sb->st_ino);
242 struct pidns_init_store *e = pidns_hash_table[h];
243
244 while (e) {
245 if (e->ino == sb->st_ino) {
246 if (initpid_still_valid(e, sb)) {
247 e->lastcheck = time(NULL);
248 return e;
249 }
250 remove_initpid(e);
251 return NULL;
252 }
253 e = e->next;
254 }
255
256 return NULL;
257}
258
/* Return 1 if @path can be stat'ed and is a directory, 0 otherwise. */
static int is_dir(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;

	return S_ISDIR(st.st_mode) ? 1 : 0;
}
267
/*
 * Duplicate @str, retrying the allocation until it succeeds.
 * Returns NULL only when @str itself is NULL.  The caller frees the copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (!str)
		return NULL;

	while (!(copy = strdup(str)))
		;

	return copy;
}
279
/* Strip every '\n' at the end of @s, in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n')
		s[--end] = '\0';
}
287
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes.  Sizes are rounded up to whole
 * batches of BATCH_SIZE bytes; the buffer is (re)allocated only when it is
 * still NULL or when @newlen needs more batches than @oldlen did.  The
 * allocation is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);
	*mem = grown;
}
/*
 * Append @line (@linelen bytes plus its terminating '\0') to *@contents,
 * growing the buffer as needed, and advance *@len by @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t total = *len + linelen;

	dorealloc(contents, *len, total + 1);
	/* linelen + 1 so the terminating '\0' is copied along with the text */
	memcpy(*contents + *len, line, linelen + 1);
	*len = total;
}
309
/*
 * Read everything available on @fd and return it as a single malloc'ed
 * string with the trailing newlines removed.
 *
 * This function takes ownership of @fd: it is closed on every path,
 * including the one where no stdio stream can be attached to it.
 * @from is accepted but not used.
 *
 * Returns NULL if the stream cannot be set up or if no line was read.
 * The caller frees the result.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* no stream owns fd after a failed fdopen(); close it here so it does not leak */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
331
/*
 * Write @string to @fd through a stdio stream.
 *
 * This function takes ownership of @fd: it is closed on every path,
 * including the one where no stream can be attached to it.
 * @fnam is accepted but not used.
 *
 * Returns true only if the whole string was written and the stream was
 * flushed and closed without error.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		/* no stream owns fd after a failed fdopen(); close it here so it does not leak */
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* fclose() flushes, so a deferred write error is reported here */
	if (fclose(f) < 0) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
352
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;	/* heap-allocated; released by free_key() */
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
358
0619767c 359#define ALLOC_NUM 20
237e200e
SH
360static bool store_hierarchy(char *stridx, char *h)
361{
0619767c
SH
362 if (num_hierarchies % ALLOC_NUM == 0) {
363 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
364 n *= ALLOC_NUM;
365 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c
SH
366 if (!tmp) {
367 fprintf(stderr, "Out of memory\n");
368 exit(1);
369 }
237e200e 370 hierarchies = tmp;
237e200e 371 }
f676eb79 372
0619767c 373 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
374 return true;
375}
376
377static void print_subsystems(void)
378{
379 int i;
380
cc97d34c 381 fprintf(stderr, "hierarchies:\n");
237e200e
SH
382 for (i = 0; i < num_hierarchies; i++) {
383 if (hierarchies[i])
384 fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
385 }
386}
387
/*
 * Return true if @needle is exactly one of the comma-separated elements
 * of @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *tok = haystack;
	const char *comma;
	size_t nlen = strlen(needle);

	/* every element that is followed by a comma */
	for (; *tok && (comma = strchr(tok, ',')); tok = comma + 1) {
		if ((size_t)(comma - tok) == nlen && strncmp(needle, tok, nlen) == 0)
			return true;
	}

	/* the last (or only) element has no trailing comma */
	return strcmp(needle, tok) == 0;
}
406
407/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
408/* Return the mounted controller and store the corresponding open file descriptor
409 * referring to the controller mountpoint in the private lxcfs namespace in
410 * @cfd.
411 */
412static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
413{
414 int i;
415
416 for (i = 0; i < num_hierarchies; i++) {
417 if (!hierarchies[i])
418 continue;
5dd3e6fd
CB
419 if (strcmp(hierarchies[i], controller) == 0) {
420 *cfd = fd_hierarchies[i];
237e200e 421 return hierarchies[i];
5dd3e6fd
CB
422 }
423 if (in_comma_list(controller, hierarchies[i])) {
424 *cfd = fd_hierarchies[i];
237e200e 425 return hierarchies[i];
5dd3e6fd 426 }
237e200e
SH
427 }
428
429 return NULL;
430}
431
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false if the controller is not mounted, the path cannot be
 * built, the file cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, n;
	size_t sz;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* . + /cgroup + / + file + \0 */
	sz = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(sz);
	/* an absolute cgroup gets a leading "." so openat() resolves it below cfd */
	n = snprintf(relpath, sz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= sz)
		return false;

	fd = openat(cfd, relpath, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(relpath, value, fd);
}
454
455// Chown all the files in the cgroup directory. We do this when we create
456// a cgroup on behalf of a user.
f23fe717 457static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e 458{
f23fe717 459 struct dirent *direntp;
237e200e
SH
460 char path[MAXPATHLEN];
461 size_t len;
462 DIR *d;
f23fe717 463 int fd1, ret;
237e200e
SH
464
465 len = strlen(dirname);
466 if (len >= MAXPATHLEN) {
467 fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
468 return;
469 }
470
f23fe717
CB
471 fd1 = openat(fd, dirname, O_DIRECTORY);
472 if (fd1 < 0)
473 return;
474
475 d = fdopendir(fd1);
237e200e
SH
476 if (!d) {
477 fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
478 return;
479 }
480
f23fe717 481 while ((direntp = readdir(d))) {
237e200e
SH
482 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
483 continue;
484 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
485 if (ret < 0 || ret >= MAXPATHLEN) {
486 fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
487 continue;
488 }
f23fe717 489 if (fchownat(fd, path, uid, gid, 0) < 0)
237e200e
SH
490 fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid);
491 }
492 closedir(d);
493}
494
/*
 * Create cgroup @cg under @controller with mode 0755.  Unless both @uid
 * and @gid are 0, the new directory and the files inside it are chowned
 * to @uid:@gid.
 * Returns 0 on success, -EINVAL if the controller is not mounted, or
 * -errno from the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t sz;
	char *reldir;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* . + /cg + \0 */
	sz = strlen(cg) + 2;
	reldir = alloca(sz);
	snprintf(reldir, sz, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, reldir, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, reldir, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(reldir, uid, gid, cfd);
	}

	return 0;
}
521
b7672ded 522static bool recursive_rmdir(const char *dirname, int fd)
237e200e 523{
b7672ded 524 struct dirent *direntp;
237e200e
SH
525 DIR *dir;
526 bool ret = false;
527 char pathname[MAXPATHLEN];
b7672ded
CB
528 int dupfd;
529
530 dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
531 if (dupfd < 0)
532 return false;
237e200e 533
b7672ded 534 dir = fdopendir(dupfd);
237e200e
SH
535 if (!dir) {
536#if DEBUG
537 fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno));
538#endif
539 return false;
540 }
541
b7672ded 542 while ((direntp = readdir(dir))) {
237e200e
SH
543 struct stat mystat;
544 int rc;
545
546 if (!direntp)
547 break;
548
549 if (!strcmp(direntp->d_name, ".") ||
550 !strcmp(direntp->d_name, ".."))
551 continue;
552
553 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
554 if (rc < 0 || rc >= MAXPATHLEN) {
555 fprintf(stderr, "pathname too long\n");
556 continue;
557 }
558
b7672ded 559 ret = fstatat(fd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e
SH
560 if (ret) {
561#if DEBUG
562 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
563#endif
564 continue;
565 }
566 if (S_ISDIR(mystat.st_mode)) {
b7672ded 567 if (!recursive_rmdir(pathname, fd)) {
237e200e
SH
568#if DEBUG
569 fprintf(stderr, "Error removing %s\n", pathname);
570#endif
571 }
572 }
573 }
574
575 ret = true;
576 if (closedir(dir) < 0) {
577 fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
578 ret = false;
579 }
580
b7672ded 581 if (unlinkat(fd, dirname, AT_REMOVEDIR) < 0) {
237e200e
SH
582#if DEBUG
583 fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno));
584#endif
585 ret = false;
586 }
b7672ded 587 close(fd);
237e200e
SH
588
589 return ret;
590}
591
592bool cgfs_remove(const char *controller, const char *cg)
593{
b7672ded 594 int fd, cfd;
237e200e 595 size_t len;
5dd3e6fd 596 char *dirnam, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
597
598 if (!tmpc)
599 return false;
b7672ded
CB
600 /* . + /cg + \0 */
601 len = strlen(cg) + 2;
237e200e 602 dirnam = alloca(len);
b7672ded
CB
603 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
604
605 fd = openat(cfd, dirnam, O_DIRECTORY);
606 if (fd < 0)
607 return false;
608
609 return recursive_rmdir(dirnam, fd);
237e200e
SH
610}
611
/*
 * Change the mode of @file, a path below the mountpoint of @controller,
 * to @mode.  Returns false if the controller is not mounted or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t sz;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* . + /file + \0 */
	sz = strlen(file) + 2;
	relpath = alloca(sz);
	snprintf(relpath, sz, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
628
/*
 * Chown the "tasks" and "cgroup.procs" files in @dirname to @uid:@gid.
 * Returns 0 on success or -errno from the first chown() that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid)
{
	/* sized for the longer of the two names, so one buffer serves both */
	size_t sz = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *target = alloca(sz);

	snprintf(target, sz, "%s/tasks", dirname);
	if (chown(target, uid, gid) != 0)
		return -errno;

	snprintf(target, sz, "%s/cgroup.procs", dirname);
	if (chown(target, uid, gid) != 0)
		return -errno;

	return 0;
}
644
645int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
646{
5dd3e6fd 647 int cfd;
237e200e 648 size_t len;
5dd3e6fd 649 char *pathname, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
650
651 if (!tmpc)
652 return -EINVAL;
cc97d34c
CB
653 /* BASEDIR / tmpc / file \0 */
654 len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3;
237e200e 655 pathname = alloca(len);
cc97d34c 656 snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file);
237e200e
SH
657 if (chown(pathname, uid, gid) < 0)
658 return -errno;
659
660 if (is_dir(pathname))
661 // like cgmanager did, we want to chown the tasks file as well
662 return chown_tasks_files(pathname, uid, gid);
663
664 return 0;
665}
666
667FILE *open_pids_file(const char *controller, const char *cgroup)
668{
5dd3e6fd 669 int cfd;
237e200e 670 size_t len;
5dd3e6fd 671 char *pathname, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
672
673 if (!tmpc)
674 return NULL;
cc97d34c
CB
675 /* BASEDIR / tmpc / cgroup / "cgroup.procs" \0 */
676 len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
237e200e 677 pathname = alloca(len);
cc97d34c 678 snprintf(pathname, len, "%s/%s/%s/cgroup.procs", BASEDIR, tmpc, cgroup);
237e200e
SH
679 return fopen(pathname, "w");
680}
681
f366da65
WB
682static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
683 void ***list, size_t typesize,
684 void* (*iterator)(const char*, const char*, const char*))
237e200e 685{
4ea38a4c 686 int cfd, fd, ret;
237e200e 687 size_t len;
4ea38a4c 688 char *cg, *tmpc;
237e200e 689 char pathname[MAXPATHLEN];
f366da65 690 size_t sz = 0, asz = 0;
4ea38a4c 691 struct dirent *dirent;
237e200e 692 DIR *dir;
237e200e 693
4ea38a4c 694 tmpc = find_mounted_controller(controller, &cfd);
f366da65 695 *list = NULL;
237e200e 696 if (!tmpc)
e97c834b 697 return false;
237e200e 698
4ea38a4c
CB
699 /* Make sure we pass a relative path to openat(). */
700 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
701 cg = alloca(len);
702 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
703 if (ret < 0 || (size_t)ret >= len) {
704 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup);
705 return false;
706 }
237e200e 707
4ea38a4c
CB
708 fd = openat(cfd, cg, O_DIRECTORY);
709 if (fd < 0)
710 return false;
711
712 dir = fdopendir(fd);
237e200e
SH
713 if (!dir)
714 return false;
715
4ea38a4c 716 while ((dirent = readdir(dir))) {
237e200e 717 struct stat mystat;
237e200e 718
4ea38a4c
CB
719 if (!strcmp(dirent->d_name, ".") ||
720 !strcmp(dirent->d_name, ".."))
237e200e
SH
721 continue;
722
4ea38a4c
CB
723 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
724 if (ret < 0 || ret >= MAXPATHLEN) {
725 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg);
237e200e
SH
726 continue;
727 }
728
4ea38a4c 729 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e
SH
730 if (ret) {
731 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
732 continue;
733 }
f366da65
WB
734 if ((!directories && !S_ISREG(mystat.st_mode)) ||
735 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
736 continue;
737
738 if (sz+2 >= asz) {
f366da65 739 void **tmp;
237e200e
SH
740 asz += BATCH_SIZE;
741 do {
f366da65 742 tmp = realloc(*list, asz * typesize);
237e200e
SH
743 } while (!tmp);
744 *list = tmp;
745 }
4ea38a4c 746 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
747 (*list)[sz+1] = NULL;
748 sz++;
749 }
750 if (closedir(dir) < 0) {
4ea38a4c 751 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno));
237e200e
SH
752 return false;
753 }
754 return true;
755}
756
f366da65
WB
/*
 * Iterator callback: return a heap copy of @dir_entry.  The allocation is
 * retried until it succeeds.  @controller and @cgroup are not used.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while (!(copy = strdup(dir_entry)))
		;

	return copy;
}
765
766bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
767{
768 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
769}
770
237e200e
SH
771void free_key(struct cgfs_files *k)
772{
773 if (!k)
774 return;
775 free(k->name);
776 free(k);
777}
778
/*
 * Free a NULL-terminated array of keys: every element, then the array
 * itself.  A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);

	free(keys);
}
790
/*
 * Read @file inside @cgroup of @controller and store its contents, as a
 * malloc'ed string, in *@value.
 *
 * Returns true if a value was read.  Returns false if the controller is
 * not mounted, the path cannot be built or opened, or nothing was read;
 * on the first three of these *@value is left untouched.
 * The caller frees *@value.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc = find_mounted_controller(controller, &cfd);

	if (!tmpc)
		return false;

	/* . + /cgroup + / + file + \0 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
813
814struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
815{
4ea38a4c 816 int ret, cfd;
237e200e 817 size_t len;
5dd3e6fd 818 char *fnam, *tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
819 struct stat sb;
820 struct cgfs_files *newkey;
237e200e
SH
821
822 if (!tmpc)
823 return false;
824
825 if (file && *file == '/')
826 file++;
827
828 if (file && index(file, '/'))
829 return NULL;
830
4ea38a4c
CB
831 /* . + /cgroup + / + file + \0 */
832 len = strlen(cgroup) + 3;
237e200e
SH
833 if (file)
834 len += strlen(file) + 1;
835 fnam = alloca(len);
4ea38a4c
CB
836 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
837 file ? "/" : "", file ? file : "");
237e200e 838
4ea38a4c 839 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
840 if (ret < 0)
841 return NULL;
842
843 do {
844 newkey = malloc(sizeof(struct cgfs_files));
845 } while (!newkey);
846 if (file)
847 newkey->name = must_copy_string(file);
848 else if (rindex(cgroup, '/'))
849 newkey->name = must_copy_string(rindex(cgroup, '/'));
850 else
851 newkey->name = must_copy_string(cgroup);
852 newkey->uid = sb.st_uid;
853 newkey->gid = sb.st_gid;
854 newkey->mode = sb.st_mode;
855
856 return newkey;
857}
858
/*
 * Iterator callback: return the key for @dir_entry in @cgroup of
 * @controller.  A failed lookup is reported on stderr and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	fprintf(stderr, "%s: Error getting files under %s:%s\n",
		__func__, controller, cgroup);
	return key;
}
868
869bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
870{
871 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
872}
873
/*
 * Return true if @f is a directory inside @cgroup of @controller.
 * Returns false if the controller is not mounted, the path cannot be
 * built or stat'ed, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t sz;
	char *relpath;
	struct stat st;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* . + /cgroup + / + f + \0 */
	sz = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(sz);
	n = snprintf(relpath, sz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= sz)
		return false;

	if (fstatat(cfd, relpath, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode);
}
896
897#define SEND_CREDS_OK 0
898#define SEND_CREDS_NOTSK 1
899#define SEND_CREDS_FAIL 2
900static bool recv_creds(int sock, struct ucred *cred, char *v);
901static int wait_for_pid(pid_t pid);
902static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 903static int send_creds_clone_wrapper(void *arg);
237e200e
SH
904
905/*
b10bdd6c 906 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
907 * over a unix sock so we can read the task's reaper's pid in our
908 * namespace
b10bdd6c
FG
909 *
910 * Note: glibc's fork() does not respect pidns, which can lead to failed
911 * assertions inside glibc (and thus failed forks) if the child's pid in
912 * the pidns and the parent pid outside are identical. Using clone prevents
913 * this issue.
237e200e
SH
914 */
915static void write_task_init_pid_exit(int sock, pid_t target)
916{
237e200e
SH
917 char fnam[100];
918 pid_t pid;
237e200e 919 int fd, ret;
b10bdd6c
FG
920 size_t stack_size = sysconf(_SC_PAGESIZE);
921 void *stack = alloca(stack_size);
237e200e
SH
922
923 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
924 if (ret < 0 || ret >= sizeof(fnam))
925 _exit(1);
926
927 fd = open(fnam, O_RDONLY);
928 if (fd < 0) {
929 perror("write_task_init_pid_exit open of ns/pid");
930 _exit(1);
931 }
932 if (setns(fd, 0)) {
933 perror("write_task_init_pid_exit setns 1");
934 close(fd);
935 _exit(1);
936 }
b10bdd6c 937 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
938 if (pid < 0)
939 _exit(1);
940 if (pid != 0) {
941 if (!wait_for_pid(pid))
942 _exit(1);
943 _exit(0);
944 }
b10bdd6c
FG
945}
946
947static int send_creds_clone_wrapper(void *arg) {
948 struct ucred cred;
949 char v;
950 int sock = *(int *)arg;
237e200e
SH
951
952 /* we are the child */
953 cred.uid = 0;
954 cred.gid = 0;
955 cred.pid = 1;
956 v = '1';
957 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
958 return 1;
959 return 0;
237e200e
SH
960}
961
962static pid_t get_init_pid_for_task(pid_t task)
963{
964 int sock[2];
965 pid_t pid;
966 pid_t ret = -1;
967 char v = '0';
968 struct ucred cred;
969
970 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
971 perror("socketpair");
972 return -1;
973 }
974
975 pid = fork();
976 if (pid < 0)
977 goto out;
978 if (!pid) {
979 close(sock[1]);
980 write_task_init_pid_exit(sock[0], task);
981 _exit(0);
982 }
983
984 if (!recv_creds(sock[1], &cred, &v))
985 goto out;
986 ret = cred.pid;
987
988out:
989 close(sock[0]);
990 close(sock[1]);
991 if (pid > 0)
992 wait_for_pid(pid);
993 return ret;
994}
995
996static pid_t lookup_initpid_in_store(pid_t qpid)
997{
998 pid_t answer = 0;
999 struct stat sb;
1000 struct pidns_init_store *e;
1001 char fnam[100];
1002
1003 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1004 store_lock();
1005 if (stat(fnam, &sb) < 0)
1006 goto out;
1007 e = lookup_verify_initpid(&sb);
1008 if (e) {
1009 answer = e->initpid;
1010 goto out;
1011 }
1012 answer = get_init_pid_for_task(qpid);
1013 if (answer > 0)
1014 save_initpid(&sb, answer);
1015
1016out:
1017 /* we prune at end in case we are returning
1018 * the value we were about to return */
1019 prune_initpid_store();
1020 store_unlock();
1021 return answer;
1022}
1023
/*
 * Wait for child @pid to terminate, restarting the wait when it is
 * interrupted by a signal.
 * Returns 0 if the child exited normally with status 0, -1 otherwise
 * (including a non-positive @pid and a failed waitpid()).
 */
static int wait_for_pid(pid_t pid)
{
	int status, got;

	if (pid <= 0)
		return -1;

	for (;;) {
		got = waitpid(pid, &status, 0);
		if (got == pid)
			break;
		if (got == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1044
1045
1046/*
1047 * append pid to *src.
1048 * src: a pointer to a char* in which ot append the pid.
1049 * sz: the number of characters printed so far, minus trailing \0.
1050 * asz: the allocated size so far
1051 * pid: the pid to append
1052 */
1053static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1054{
1055 char tmp[30];
1056
1057 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1058
1059 if (!*src || tmplen + *sz + 1 >= *asz) {
1060 char *tmp;
1061 do {
1062 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1063 } while (!tmp);
1064 *src = tmp;
1065 *asz += BUF_RESERVE_SIZE;
1066 }
bbfd0e33 1067 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1068 *sz += tmplen;
237e200e
SH
1069}
1070
/*
 * Given a open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 on error.
 *
 * The file is rewound first.  Lines that do not hold three unsigned
 * numbers are skipped.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,	// base id for a range in the idfile's namespace
		     host_base,	// base id for a range in the caller's namespace
		     range;	// number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		/* not inside [host_base, host_base + range): try the next line */
		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1114
1115/*
1116 * for is_privileged_over,
1117 * specify whether we require the calling uid to be root in his
1118 * namespace
1119 */
1120#define NS_ROOT_REQD true
1121#define NS_ROOT_OPT false
1122
1123#define PROCLEN 100
1124
1125static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1126{
1127 char fpath[PROCLEN];
1128 int ret;
1129 bool answer = false;
1130 uid_t nsuid;
1131
1132 if (victim == -1 || uid == -1)
1133 return false;
1134
1135 /*
1136 * If the request is one not requiring root in the namespace,
1137 * then having the same uid suffices. (i.e. uid 1000 has write
1138 * access to files owned by uid 1000
1139 */
1140 if (!req_ns_root && uid == victim)
1141 return true;
1142
1143 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1144 if (ret < 0 || ret >= PROCLEN)
1145 return false;
1146 FILE *f = fopen(fpath, "r");
1147 if (!f)
1148 return false;
1149
1150 /* if caller's not root in his namespace, reject */
1151 nsuid = convert_id_to_ns(f, uid);
1152 if (nsuid)
1153 goto out;
1154
1155 /*
1156 * If victim is not mapped into caller's ns, reject.
1157 * XXX I'm not sure this check is needed given that fuse
1158 * will be sending requests where the vfs has converted
1159 */
1160 nsuid = convert_id_to_ns(f, victim);
1161 if (nsuid == -1)
1162 goto out;
1163
1164 answer = true;
1165
1166out:
1167 fclose(f);
1168 return answer;
1169}
1170
/*
 * Return true if the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode (O_RDONLY, O_WRONLY or
 * O_RDWR).  Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1190
1191
/*
 * Return the first path component of @taskcg that lies below @querycg.
 *
 * querycg is a/b/c
 * taskcg  is a/b/c/d/e
 * we return 'd'
 *
 * @querycg may also be the root cgroup, spelled "/" or "./"; in that case
 * only the first character of @taskcg is skipped.
 *
 * @taskcg must be strictly longer than @querycg, otherwise NULL is
 * returned and a diagnostic is printed.  NULL is also returned when the
 * copy cannot be allocated.  The result is allocated with strdup() and must
 * be freed by the caller.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t query_len = strlen(querycg);
	size_t skip;
	char *start, *end;

	if (strlen(taskcg) <= query_len) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/*
	 * Both spellings of the root cgroup must be treated alike; skipping
	 * strlen("./") + 1 bytes would jump past the first component.
	 */
	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = query_len + 1;

	start = strdup(taskcg + skip);
	if (!start)
		return NULL;

	/* keep only the first component */
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}
1217
/* Remove a single trailing '\n' from @x, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1224
1225static char *get_pid_cgroup(pid_t pid, const char *contrl)
1226{
5dd3e6fd 1227 int cfd;
237e200e
SH
1228 char fnam[PROCLEN];
1229 FILE *f;
1230 char *answer = NULL;
1231 char *line = NULL;
1232 size_t len = 0;
1233 int ret;
5dd3e6fd 1234 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1235 if (!h)
1236 return NULL;
1237
1238 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1239 if (ret < 0 || ret >= PROCLEN)
1240 return NULL;
1241 if (!(f = fopen(fnam, "r")))
1242 return NULL;
1243
1244 while (getline(&line, &len, f) != -1) {
1245 char *c1, *c2;
1246 if (!line[0])
1247 continue;
1248 c1 = strchr(line, ':');
1249 if (!c1)
1250 goto out;
1251 c1++;
1252 c2 = strchr(c1, ':');
1253 if (!c2)
1254 goto out;
1255 *c2 = '\0';
1256 if (strcmp(c1, h) != 0)
1257 continue;
1258 c2++;
1259 stripnewline(c2);
1260 do {
1261 answer = strdup(c2);
1262 } while (!answer);
1263 break;
1264 }
1265
1266out:
1267 fclose(f);
1268 free(line);
1269 return answer;
1270}
1271
1272/*
1273 * check whether a fuse context may access a cgroup dir or file
1274 *
1275 * If file is not null, it is a cgroup file to check under cg.
1276 * If file is null, then we are checking perms on cg itself.
1277 *
1278 * For files we can check the mode of the list_keys result.
1279 * For cgroups, we must make assumptions based on the files under the
1280 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1281 * yet.
1282 */
1283static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1284{
1285 struct cgfs_files *k = NULL;
1286 bool ret = false;
1287
1288 k = cgfs_get_key(contrl, cg, file);
1289 if (!k)
1290 return false;
1291
1292 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1293 if (perms_include(k->mode >> 6, mode)) {
1294 ret = true;
1295 goto out;
1296 }
1297 }
1298 if (fc->gid == k->gid) {
1299 if (perms_include(k->mode >> 3, mode)) {
1300 ret = true;
1301 goto out;
1302 }
1303 }
1304 ret = perms_include(k->mode, mode);
1305
1306out:
1307 free_key(k);
1308 return ret;
1309}
1310
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place.
 * If the path consists of nothing but that suffix it becomes "/".
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* keep the leading '/' when the suffix is the whole string */
	if (tail == cg)
		tail++;
	*tail = '\0';
}
1328
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the cgroup of @pid (after prune_init_slice()) is a
 * string prefix of @cg.  Callers pass "/" or "./" for the root cgroup and a
 * path without leading '/' otherwise; in the latter case the leading
 * character of the task's cgroup is skipped before comparing.
 *
 * When the answer is false and @nextcg is not NULL, *nextcg receives the
 * result of get_next_cgroup_dir() - the next directory below @cg on the way
 * to the task's cgroup, or NULL - which the caller must free.  *nextcg is
 * left untouched when the cgroup of @pid cannot be determined.
 *
 * NOTE(review): the comparison is a plain prefix match without a check for
 * a '/' boundary, so a task in a/b also matches a/bc - confirm that this is
 * intended.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *task_cg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool in_ancestor;

	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = task_cg;
	else
		cmp = task_cg + 1;

	in_ancestor = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!in_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cg);
	return in_ancestor;
}
1371
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root cgroup ("/" or "./") is always visible.  Otherwise @cg is
 * visible when the task lives in the root cgroup, when @cg is exactly the
 * task's cgroup, or when one of the two is a parent of the other (the
 * shorter path is a prefix of the longer one up to a '/' boundary).
 * Returns false when the cgroup of @pid cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw, *task_cg;
	size_t target_len, task_len;
	bool visible = false;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	/* compare without the first character of the task's cgroup */
	task_cg = raw + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0 || strcmp(cg, task_cg) == 0) {
		/* task sits in the root cgroup, or in @cg itself */
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(raw);
	return visible;
}
1422
1423/*
1424 * given /cgroup/freezer/a/b, return "freezer".
1425 * the returned char* should NOT be freed.
1426 */
1427static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1428{
1429 const char *p1;
1430 char *contr, *slash;
1431
1432 if (strlen(path) < 9)
1433 return NULL;
1434 if (*(path+7) != '/')
1435 return NULL;
1436 p1 = path+8;
1437 contr = strdupa(p1);
1438 if (!contr)
1439 return NULL;
1440 slash = strstr(contr, "/");
1441 if (slash)
1442 *slash = '\0';
1443
1444 int i;
1445 for (i = 0; i < num_hierarchies; i++) {
1446 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1447 return hierarchies[i];
1448 }
1449 return NULL;
1450}
1451
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL when @path is shorter than 9 characters or holds no
 * such '/'.  Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
1467
/*
 * split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg cut off at its last '/'; it
 * should be freed by the caller.  @last receives a pointer INTO @cg at that
 * last '/' (so it still starts with the slash) and must not be freed.
 * If @cg holds no '/', @dir is a full copy and @last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;
	char *sep;

	/* keep retrying until the copy succeeds */
	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	sep = strrchr(cg, '/');
	*last = sep;
	if (sep)
		copy[sep - cg] = '\0';
}
1487
1488/*
1489 * FUSE ops for /cgroup
1490 */
1491
1492int cg_getattr(const char *path, struct stat *sb)
1493{
1494 struct timespec now;
1495 struct fuse_context *fc = fuse_get_context();
1496 char * cgdir = NULL;
1497 char *last = NULL, *path1, *path2;
1498 struct cgfs_files *k = NULL;
1499 const char *cgroup;
1500 const char *controller = NULL;
1501 int ret = -ENOENT;
1502
1503
1504 if (!fc)
1505 return -EIO;
1506
1507 memset(sb, 0, sizeof(struct stat));
1508
1509 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1510 return -EINVAL;
1511
1512 sb->st_uid = sb->st_gid = 0;
1513 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1514 sb->st_size = 0;
1515
1516 if (strcmp(path, "/cgroup") == 0) {
1517 sb->st_mode = S_IFDIR | 00755;
1518 sb->st_nlink = 2;
1519 return 0;
1520 }
1521
1522 controller = pick_controller_from_path(fc, path);
1523 if (!controller)
1524 return -EIO;
1525 cgroup = find_cgroup_in_path(path);
1526 if (!cgroup) {
1527 /* this is just /cgroup/controller, return it as a dir */
1528 sb->st_mode = S_IFDIR | 00755;
1529 sb->st_nlink = 2;
1530 return 0;
1531 }
1532
1533 get_cgdir_and_path(cgroup, &cgdir, &last);
1534
1535 if (!last) {
1536 path1 = "/";
1537 path2 = cgdir;
1538 } else {
1539 path1 = cgdir;
1540 path2 = last;
1541 }
1542
1543 pid_t initpid = lookup_initpid_in_store(fc->pid);
1544 if (initpid <= 0)
1545 initpid = fc->pid;
1546 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1547 * Then check that caller's cgroup is under path if last is a child
1548 * cgroup, or cgdir if last is a file */
1549
1550 if (is_child_cgroup(controller, path1, path2)) {
1551 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1552 ret = -ENOENT;
1553 goto out;
1554 }
1555 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1556 /* this is just /cgroup/controller, return it as a dir */
1557 sb->st_mode = S_IFDIR | 00555;
1558 sb->st_nlink = 2;
1559 ret = 0;
1560 goto out;
1561 }
1562 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1563 ret = -EACCES;
1564 goto out;
1565 }
1566
1567 // get uid, gid, from '/tasks' file and make up a mode
1568 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1569 sb->st_mode = S_IFDIR | 00755;
1570 k = cgfs_get_key(controller, cgroup, NULL);
1571 if (!k) {
1572 sb->st_uid = sb->st_gid = 0;
1573 } else {
1574 sb->st_uid = k->uid;
1575 sb->st_gid = k->gid;
1576 }
1577 free_key(k);
1578 sb->st_nlink = 2;
1579 ret = 0;
1580 goto out;
1581 }
1582
1583 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1584 sb->st_mode = S_IFREG | k->mode;
1585 sb->st_nlink = 1;
1586 sb->st_uid = k->uid;
1587 sb->st_gid = k->gid;
1588 sb->st_size = 0;
1589 free_key(k);
1590 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1591 ret = -ENOENT;
1592 goto out;
1593 }
1594 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1595 ret = -EACCES;
1596 goto out;
1597 }
1598
1599 ret = 0;
1600 }
1601
1602out:
1603 free(cgdir);
1604 return ret;
1605}
1606
1607int cg_opendir(const char *path, struct fuse_file_info *fi)
1608{
1609 struct fuse_context *fc = fuse_get_context();
1610 const char *cgroup;
1611 struct file_info *dir_info;
1612 char *controller = NULL;
1613
1614 if (!fc)
1615 return -EIO;
1616
1617 if (strcmp(path, "/cgroup") == 0) {
1618 cgroup = NULL;
1619 controller = NULL;
1620 } else {
1621 // return list of keys for the controller, and list of child cgroups
1622 controller = pick_controller_from_path(fc, path);
1623 if (!controller)
1624 return -EIO;
1625
1626 cgroup = find_cgroup_in_path(path);
1627 if (!cgroup) {
1628 /* this is just /cgroup/controller, return its contents */
1629 cgroup = "/";
1630 }
1631 }
1632
1633 pid_t initpid = lookup_initpid_in_store(fc->pid);
1634 if (initpid <= 0)
1635 initpid = fc->pid;
1636 if (cgroup) {
1637 if (!caller_may_see_dir(initpid, controller, cgroup))
1638 return -ENOENT;
1639 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1640 return -EACCES;
1641 }
1642
1643 /* we'll free this at cg_releasedir */
1644 dir_info = malloc(sizeof(*dir_info));
1645 if (!dir_info)
1646 return -ENOMEM;
1647 dir_info->controller = must_copy_string(controller);
1648 dir_info->cgroup = must_copy_string(cgroup);
1649 dir_info->type = LXC_TYPE_CGDIR;
1650 dir_info->buf = NULL;
1651 dir_info->file = NULL;
1652 dir_info->buflen = 0;
1653
1654 fi->fh = (unsigned long)dir_info;
1655 return 0;
1656}
1657
1658int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1659 struct fuse_file_info *fi)
1660{
1661 struct file_info *d = (struct file_info *)fi->fh;
1662 struct cgfs_files **list = NULL;
1663 int i, ret;
1664 char *nextcg = NULL;
1665 struct fuse_context *fc = fuse_get_context();
1666 char **clist = NULL;
1667
1668 if (d->type != LXC_TYPE_CGDIR) {
1669 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1670 return -EIO;
1671 }
1672 if (!d->cgroup && !d->controller) {
1673 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1674 int i;
1675
1676 for (i = 0; i < num_hierarchies; i++) {
1677 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1678 return -EIO;
1679 }
1680 }
1681 return 0;
1682 }
1683
1684 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1685 // not a valid cgroup
1686 ret = -EINVAL;
1687 goto out;
1688 }
1689
1690 pid_t initpid = lookup_initpid_in_store(fc->pid);
1691 if (initpid <= 0)
1692 initpid = fc->pid;
1693 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1694 if (nextcg) {
1695 ret = filler(buf, nextcg, NULL, 0);
1696 free(nextcg);
1697 if (ret != 0) {
1698 ret = -EIO;
1699 goto out;
1700 }
1701 }
1702 ret = 0;
1703 goto out;
1704 }
1705
1706 for (i = 0; list[i]; i++) {
1707 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1708 ret = -EIO;
1709 goto out;
1710 }
1711 }
1712
1713 // now get the list of child cgroups
1714
1715 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1716 ret = 0;
1717 goto out;
1718 }
f366da65
WB
1719 if (clist) {
1720 for (i = 0; clist[i]; i++) {
1721 if (filler(buf, clist[i], NULL, 0) != 0) {
1722 ret = -EIO;
1723 goto out;
1724 }
237e200e
SH
1725 }
1726 }
1727 ret = 0;
1728
1729out:
1730 free_keys(list);
1731 if (clist) {
1732 for (i = 0; clist[i]; i++)
1733 free(clist[i]);
1734 free(clist);
1735 }
1736 return ret;
1737}
1738
43215927 1739static void do_release_file_info(struct fuse_file_info *fi)
237e200e 1740{
43215927
SH
1741 struct file_info *f = (struct file_info *)fi->fh;
1742
237e200e
SH
1743 if (!f)
1744 return;
43215927
SH
1745
1746 fi->fh = 0;
1747
237e200e 1748 free(f->controller);
43215927 1749 f->controller = NULL;
237e200e 1750 free(f->cgroup);
43215927 1751 f->cgroup = NULL;
237e200e 1752 free(f->file);
43215927 1753 f->file = NULL;
237e200e 1754 free(f->buf);
43215927 1755 f->buf = NULL;
237e200e
SH
1756 free(f);
1757}
1758
/*
 * FUSE releasedir handler: drops the handle stored in @fi.
 * @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1764
1765int cg_open(const char *path, struct fuse_file_info *fi)
1766{
1767 const char *cgroup;
1768 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1769 struct cgfs_files *k = NULL;
1770 struct file_info *file_info;
1771 struct fuse_context *fc = fuse_get_context();
1772 int ret;
1773
1774 if (!fc)
1775 return -EIO;
1776
1777 controller = pick_controller_from_path(fc, path);
1778 if (!controller)
1779 return -EIO;
1780 cgroup = find_cgroup_in_path(path);
1781 if (!cgroup)
1782 return -EINVAL;
1783
1784 get_cgdir_and_path(cgroup, &cgdir, &last);
1785 if (!last) {
1786 path1 = "/";
1787 path2 = cgdir;
1788 } else {
1789 path1 = cgdir;
1790 path2 = last;
1791 }
1792
1793 k = cgfs_get_key(controller, path1, path2);
1794 if (!k) {
1795 ret = -EINVAL;
1796 goto out;
1797 }
1798 free_key(k);
1799
1800 pid_t initpid = lookup_initpid_in_store(fc->pid);
1801 if (initpid <= 0)
1802 initpid = fc->pid;
1803 if (!caller_may_see_dir(initpid, controller, path1)) {
1804 ret = -ENOENT;
1805 goto out;
1806 }
1807 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
1808 ret = -EACCES;
1809 goto out;
1810 }
1811
1812 /* we'll free this at cg_release */
1813 file_info = malloc(sizeof(*file_info));
1814 if (!file_info) {
1815 ret = -ENOMEM;
1816 goto out;
1817 }
1818 file_info->controller = must_copy_string(controller);
1819 file_info->cgroup = must_copy_string(path1);
1820 file_info->file = must_copy_string(path2);
1821 file_info->type = LXC_TYPE_CGFILE;
1822 file_info->buf = NULL;
1823 file_info->buflen = 0;
1824
1825 fi->fh = (unsigned long)file_info;
1826 ret = 0;
1827
1828out:
1829 free(cgdir);
1830 return ret;
1831}
1832
bddbb106
SH
1833int cg_access(const char *path, int mode)
1834{
1835 const char *cgroup;
1836 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1837 struct cgfs_files *k = NULL;
1838 struct fuse_context *fc = fuse_get_context();
1839 int ret;
1840
1841 if (!fc)
1842 return -EIO;
1843
1844 controller = pick_controller_from_path(fc, path);
1845 if (!controller)
1846 return -EIO;
1847 cgroup = find_cgroup_in_path(path);
575316c4
SH
1848 if (!cgroup) {
1849 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
1850 if ((mode & W_OK) == 0)
1851 return 0;
1852 return -EACCES;
575316c4 1853 }
bddbb106
SH
1854
1855 get_cgdir_and_path(cgroup, &cgdir, &last);
1856 if (!last) {
1857 path1 = "/";
1858 path2 = cgdir;
1859 } else {
1860 path1 = cgdir;
1861 path2 = last;
1862 }
1863
1864 k = cgfs_get_key(controller, path1, path2);
1865 if (!k) {
3f441bc7
SH
1866 if ((mode & W_OK) == 0)
1867 ret = 0;
1868 else
1869 ret = -EACCES;
bddbb106
SH
1870 goto out;
1871 }
1872 free_key(k);
1873
1874 pid_t initpid = lookup_initpid_in_store(fc->pid);
1875 if (initpid <= 0)
1876 initpid = fc->pid;
1877 if (!caller_may_see_dir(initpid, controller, path1)) {
1878 ret = -ENOENT;
1879 goto out;
1880 }
1881 if (!fc_may_access(fc, controller, path1, path2, mode)) {
1882 ret = -EACCES;
1883 goto out;
1884 }
1885
1886 ret = 0;
1887
1888out:
1889 free(cgdir);
1890 return ret;
1891}
1892
/*
 * FUSE release handler: drops the handle stored in @fi.
 * @path is not used.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1898
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable or is hung up, for at most roughly
 * @timeout seconds (wall clock, one-second granularity).
 *
 * Returns true when epoll reports an event.  Returns false on timeout or
 * on error; after a timeout detected by the deadline check errno is 0,
 * otherwise errno is the value left by epoll_wait().  Interrupted waits
 * are restarted with the remaining time.
 *
 * Timestamps are kept in time_t: squeezing them into an int would turn
 * negative once time() exceeds INT_MAX and make every call fail.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime;

	if ((starttime = time(NULL)) == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	do {
		if ((now = time(NULL)) == (time_t)-1) {
			close(epfd);
			return false;
		}

		/* seconds left until the deadline; small, so int is enough */
		deltatime = (int)((starttime + timeout) - now);
		if (deltatime < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}
		ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	} while (ret < 0 && errno == EINTR);

	/* close() may overwrite errno */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1946
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about
 * two seconds for data to arrive.  Returns the result of recv(), or -1
 * when the wait fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
1953
1954static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1955{
1956 struct msghdr msg = { 0 };
1957 struct iovec iov;
1958 struct cmsghdr *cmsg;
1959 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1960 char buf[1];
1961 buf[0] = 'p';
1962
1963 if (pingfirst) {
1964 if (msgrecv(sock, buf, 1) != 1) {
1965 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1966 __func__);
1967 return SEND_CREDS_FAIL;
1968 }
1969 }
1970
1971 msg.msg_control = cmsgbuf;
1972 msg.msg_controllen = sizeof(cmsgbuf);
1973
1974 cmsg = CMSG_FIRSTHDR(&msg);
1975 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1976 cmsg->cmsg_level = SOL_SOCKET;
1977 cmsg->cmsg_type = SCM_CREDENTIALS;
1978 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1979
1980 msg.msg_name = NULL;
1981 msg.msg_namelen = 0;
1982
1983 buf[0] = v;
1984 iov.iov_base = buf;
1985 iov.iov_len = sizeof(buf);
1986 msg.msg_iov = &iov;
1987 msg.msg_iovlen = 1;
1988
1989 if (sendmsg(sock, &msg, 0) < 0) {
1990 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1991 strerror(errno));
1992 if (errno == 3)
1993 return SEND_CREDS_NOTSK;
1994 return SEND_CREDS_FAIL;
1995 }
1996
1997 return SEND_CREDS_OK;
1998}
1999
2000static bool recv_creds(int sock, struct ucred *cred, char *v)
2001{
2002 struct msghdr msg = { 0 };
2003 struct iovec iov;
2004 struct cmsghdr *cmsg;
2005 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2006 char buf[1];
2007 int ret;
2008 int optval = 1;
2009
2010 *v = '1';
2011
2012 cred->pid = -1;
2013 cred->uid = -1;
2014 cred->gid = -1;
2015
2016 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2017 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
2018 return false;
2019 }
2020 buf[0] = '1';
2021 if (write(sock, buf, 1) != 1) {
2022 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
2023 return false;
2024 }
2025
2026 msg.msg_name = NULL;
2027 msg.msg_namelen = 0;
2028 msg.msg_control = cmsgbuf;
2029 msg.msg_controllen = sizeof(cmsgbuf);
2030
2031 iov.iov_base = buf;
2032 iov.iov_len = sizeof(buf);
2033 msg.msg_iov = &iov;
2034 msg.msg_iovlen = 1;
2035
2036 if (!wait_for_sock(sock, 2)) {
2037 fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
2038 strerror(errno));
2039 return false;
2040 }
2041 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2042 if (ret < 0) {
2043 fprintf(stderr, "Failed to receive scm_cred: %s\n",
2044 strerror(errno));
2045 return false;
2046 }
2047
2048 cmsg = CMSG_FIRSTHDR(&msg);
2049
2050 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2051 cmsg->cmsg_level == SOL_SOCKET &&
2052 cmsg->cmsg_type == SCM_CREDENTIALS) {
2053 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2054 }
2055 *v = buf[0];
2056
2057 return true;
2058}
2059
/* Arguments handed through clone() to pid_ns_clone_wrapper(). */
struct pid_ns_clone_args {
	int *cpipe;			/* pipe fds; the child acks on cpipe[1] */
	int sock;			/* socket passed on to @wrapped */
	pid_t tpid;			/* pid passed on to @wrapped */
	int (*wrapped) (int, pid_t);	/* pid_from_ns or pid_to_ns */
};
2066
2067/*
2068 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2069 * with clone(). This simply writes '1' as ACK back to the parent
2070 * before calling the actual wrapped function.
2071 */
2072static int pid_ns_clone_wrapper(void *arg) {
2073 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2074 char b = '1';
2075
2076 close(args->cpipe[0]);
2077 if (write(args->cpipe[1], &b, sizeof(char)) < 0) {
2078 fprintf(stderr, "%s (child): error on write: %s\n",
2079 __func__, strerror(errno));
2080 }
2081 close(args->cpipe[1]);
2082 return args->wrapped(args->sock, args->tpid);
2083}
237e200e
SH
2084
2085/*
2086 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2087 * int value back over the socket. This shifts the pid from the
2088 * sender's pidns into tpid's pidns.
2089 */
35174b0f 2090static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2091{
2092 char v = '0';
2093 struct ucred cred;
2094
2095 while (recv_creds(sock, &cred, &v)) {
2096 if (v == '1')
35174b0f 2097 return 0;
237e200e 2098 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2099 return 1;
237e200e 2100 }
35174b0f 2101 return 0;
237e200e
SH
2102}
2103
35174b0f 2104
237e200e
SH
2105/*
2106 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2107 * in your old pidns. Only children which you clone will be in the target
2108 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2109 * actually convert pids.
2110 *
2111 * Note: glibc's fork() does not respect pidns, which can lead to failed
2112 * assertions inside glibc (and thus failed forks) if the child's pid in
2113 * the pidns and the parent pid outside are identical. Using clone prevents
2114 * this issue.
237e200e
SH
2115 */
2116static void pid_to_ns_wrapper(int sock, pid_t tpid)
2117{
2118 int newnsfd = -1, ret, cpipe[2];
2119 char fnam[100];
2120 pid_t cpid;
2121 char v;
2122
2123 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2124 if (ret < 0 || ret >= sizeof(fnam))
2125 _exit(1);
2126 newnsfd = open(fnam, O_RDONLY);
2127 if (newnsfd < 0)
2128 _exit(1);
2129 if (setns(newnsfd, 0) < 0)
2130 _exit(1);
2131 close(newnsfd);
2132
2133 if (pipe(cpipe) < 0)
2134 _exit(1);
2135
35174b0f
FG
2136 struct pid_ns_clone_args args = {
2137 .cpipe = cpipe,
2138 .sock = sock,
2139 .tpid = tpid,
2140 .wrapped = &pid_to_ns
2141 };
2142 size_t stack_size = sysconf(_SC_PAGESIZE);
2143 void *stack = alloca(stack_size);
2144
2145 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2146 if (cpid < 0)
2147 _exit(1);
2148
237e200e
SH
2149 // give the child 1 second to be done forking and
2150 // write its ack
2151 if (!wait_for_sock(cpipe[0], 1))
2152 _exit(1);
2153 ret = read(cpipe[0], &v, 1);
2154 if (ret != sizeof(char) || v != '1')
2155 _exit(1);
2156
2157 if (!wait_for_pid(cpid))
2158 _exit(1);
2159 _exit(0);
2160}
2161
2162/*
2163 * To read cgroup files with a particular pid, we will setns into the child
2164 * pidns, open a pipe, fork a child - which will be the first to really be in
2165 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2166 */
2167bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2168{
2169 int sock[2] = {-1, -1};
2170 char *tmpdata = NULL;
2171 int ret;
2172 pid_t qpid, cpid = -1;
2173 bool answer = false;
2174 char v = '0';
2175 struct ucred cred;
2176 size_t sz = 0, asz = 0;
2177
2178 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2179 return false;
2180
2181 /*
2182 * Now we read the pids from returned data one by one, pass
2183 * them into a child in the target namespace, read back the
2184 * translated pids, and put them into our to-return data
2185 */
2186
2187 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2188 perror("socketpair");
2189 free(tmpdata);
2190 return false;
2191 }
2192
2193 cpid = fork();
2194 if (cpid == -1)
2195 goto out;
2196
2197 if (!cpid) // child - exits when done
2198 pid_to_ns_wrapper(sock[1], tpid);
2199
2200 char *ptr = tmpdata;
2201 cred.uid = 0;
2202 cred.gid = 0;
2203 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2204 cred.pid = qpid;
2205 ret = send_creds(sock[0], &cred, v, true);
2206
2207 if (ret == SEND_CREDS_NOTSK)
2208 goto next;
2209 if (ret == SEND_CREDS_FAIL)
2210 goto out;
2211
2212 // read converted results
2213 if (!wait_for_sock(sock[0], 2)) {
2214 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2215 __func__, strerror(errno));
2216 goto out;
2217 }
2218 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2219 fprintf(stderr, "%s: error reading pid from child: %s\n",
2220 __func__, strerror(errno));
2221 goto out;
2222 }
2223 must_strcat_pid(d, &sz, &asz, qpid);
2224next:
2225 ptr = strchr(ptr, '\n');
2226 if (!ptr)
2227 break;
2228 ptr++;
2229 }
2230
2231 cred.pid = getpid();
2232 v = '1';
2233 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2234 // failed to ask child to exit
2235 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2236 __func__, strerror(errno));
2237 goto out;
2238 }
2239
2240 answer = true;
2241
2242out:
2243 free(tmpdata);
2244 if (cpid != -1)
2245 wait_for_pid(cpid);
2246 if (sock[0] != -1) {
2247 close(sock[0]);
2248 close(sock[1]);
2249 }
2250 return answer;
2251}
2252
2253int cg_read(const char *path, char *buf, size_t size, off_t offset,
2254 struct fuse_file_info *fi)
2255{
2256 struct fuse_context *fc = fuse_get_context();
2257 struct file_info *f = (struct file_info *)fi->fh;
2258 struct cgfs_files *k = NULL;
2259 char *data = NULL;
2260 int ret, s;
2261 bool r;
2262
2263 if (f->type != LXC_TYPE_CGFILE) {
2264 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2265 return -EIO;
2266 }
2267
2268 if (offset)
2269 return 0;
2270
2271 if (!fc)
2272 return -EIO;
2273
2274 if (!f->controller)
2275 return -EINVAL;
2276
2277 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2278 return -EINVAL;
2279 }
2280 free_key(k);
2281
2282
888f8f3c 2283 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2284 ret = -EACCES;
2285 goto out;
2286 }
2287
2288 if (strcmp(f->file, "tasks") == 0 ||
2289 strcmp(f->file, "/tasks") == 0 ||
2290 strcmp(f->file, "/cgroup.procs") == 0 ||
2291 strcmp(f->file, "cgroup.procs") == 0)
2292 // special case - we have to translate the pids
2293 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2294 else
2295 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2296
2297 if (!r) {
2298 ret = -EINVAL;
2299 goto out;
2300 }
2301
2302 if (!data) {
2303 ret = 0;
2304 goto out;
2305 }
2306 s = strlen(data);
2307 if (s > size)
2308 s = size;
2309 memcpy(buf, data, s);
2310 if (s > 0 && s < size && data[s-1] != '\n')
2311 buf[s++] = '\n';
2312
2313 ret = s;
2314
2315out:
2316 free(data);
2317 return ret;
2318}
2319
35174b0f 2320static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2321{
2322 pid_t vpid;
2323 struct ucred cred;
2324 char v;
2325 int ret;
2326
2327 cred.uid = 0;
2328 cred.gid = 0;
2329 while (1) {
2330 if (!wait_for_sock(sock, 2)) {
2331 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
35174b0f 2332 return 1;
237e200e
SH
2333 }
2334 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2335 fprintf(stderr, "%s: bad read from parent: %s\n",
2336 __func__, strerror(errno));
35174b0f 2337 return 1;
237e200e
SH
2338 }
2339 if (vpid == -1) // done
2340 break;
2341 v = '0';
2342 cred.pid = vpid;
2343 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2344 v = '1';
2345 cred.pid = getpid();
2346 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2347 return 1;
237e200e
SH
2348 }
2349 }
35174b0f 2350 return 0;
237e200e
SH
2351}
2352
2353static void pid_from_ns_wrapper(int sock, pid_t tpid)
2354{
2355 int newnsfd = -1, ret, cpipe[2];
2356 char fnam[100];
2357 pid_t cpid;
2358 char v;
2359
2360 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2361 if (ret < 0 || ret >= sizeof(fnam))
2362 _exit(1);
2363 newnsfd = open(fnam, O_RDONLY);
2364 if (newnsfd < 0)
2365 _exit(1);
2366 if (setns(newnsfd, 0) < 0)
2367 _exit(1);
2368 close(newnsfd);
2369
2370 if (pipe(cpipe) < 0)
2371 _exit(1);
2372
35174b0f
FG
2373 struct pid_ns_clone_args args = {
2374 .cpipe = cpipe,
2375 .sock = sock,
2376 .tpid = tpid,
2377 .wrapped = &pid_from_ns
2378 };
f0f8b851
SH
2379 size_t stack_size = sysconf(_SC_PAGESIZE);
2380 void *stack = alloca(stack_size);
35174b0f
FG
2381
2382 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2383 if (cpid < 0)
2384 _exit(1);
2385
237e200e
SH
2386 // give the child 1 second to be done forking and
2387 // write its ack
2388 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2389 _exit(1);
237e200e 2390 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2391 if (ret != sizeof(char) || v != '1')
2392 _exit(1);
237e200e
SH
2393
2394 if (!wait_for_pid(cpid))
2395 _exit(1);
2396 _exit(0);
237e200e
SH
2397}
2398
/*
 * Given host @uid, look up the uid to which it maps in @pid's user
 * namespace, using /proc/@pid/uid_map.
 *
 * Returns true and stores the mapped uid in *answer on success.  Returns
 * false when the map cannot be opened (*answer is then untouched) or when
 * @uid has no mapping (*answer is then -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);
	map = fopen(map_path, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2420
/*
 * get_pid_creds: read the uid and gid of @pid from the "Uid:" and "Gid:"
 * lines of /proc/@pid/status.  The first number on each line is used.
 * (XXX should we use euid here?)
 *
 * *uid and *gid start out as -1 and keep that value when the status file
 * cannot be opened or the respective line is missing.  Parsing stops at the
 * first malformed "Uid:" or "Gid:" line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, 400, status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &parsed_uid) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &parsed_gid) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2459
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a
 * new cgroup?  This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's user namespace and @v's uid is mapped
 *     into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (victim_uid == r_uid)
		return true;

	/* the requestor must be root inside its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped there */
	return hostuid_to_ns(victim_uid, r, &mapped);
}
2485
2486static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2487 const char *file, const char *buf)
2488{
2489 int sock[2] = {-1, -1};
2490 pid_t qpid, cpid = -1;
2491 FILE *pids_file = NULL;
2492 bool answer = false, fail = false;
2493
2494 pids_file = open_pids_file(contrl, cg);
2495 if (!pids_file)
2496 return false;
2497
2498 /*
2499 * write the pids to a socket, have helper in writer's pidns
2500 * call movepid for us
2501 */
2502 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2503 perror("socketpair");
2504 goto out;
2505 }
2506
2507 cpid = fork();
2508 if (cpid == -1)
2509 goto out;
2510
2511 if (!cpid) { // child
2512 fclose(pids_file);
2513 pid_from_ns_wrapper(sock[1], tpid);
2514 }
2515
2516 const char *ptr = buf;
2517 while (sscanf(ptr, "%d", &qpid) == 1) {
2518 struct ucred cred;
2519 char v;
2520
2521 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2522 fprintf(stderr, "%s: error writing pid to child: %s\n",
2523 __func__, strerror(errno));
2524 goto out;
2525 }
2526
2527 if (recv_creds(sock[0], &cred, &v)) {
2528 if (v == '0') {
2529 if (!may_move_pid(tpid, tuid, cred.pid)) {
2530 fail = true;
2531 break;
2532 }
2533 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2534 fail = true;
2535 }
2536 }
2537
2538 ptr = strchr(ptr, '\n');
2539 if (!ptr)
2540 break;
2541 ptr++;
2542 }
2543
2544 /* All good, write the value */
2545 qpid = -1;
2546 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2547 fprintf(stderr, "Warning: failed to ask child to exit\n");
2548
2549 if (!fail)
2550 answer = true;
2551
2552out:
2553 if (cpid != -1)
2554 wait_for_pid(cpid);
2555 if (sock[0] != -1) {
2556 close(sock[0]);
2557 close(sock[1]);
2558 }
2559 if (pids_file) {
2560 if (fclose(pids_file) != 0)
2561 answer = false;
2562 }
2563 return answer;
2564}
2565
2566int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2567 struct fuse_file_info *fi)
2568{
2569 struct fuse_context *fc = fuse_get_context();
2570 char *localbuf = NULL;
2571 struct cgfs_files *k = NULL;
2572 struct file_info *f = (struct file_info *)fi->fh;
2573 bool r;
2574
2575 if (f->type != LXC_TYPE_CGFILE) {
2576 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2577 return -EIO;
2578 }
2579
2580 if (offset)
2581 return 0;
2582
2583 if (!fc)
2584 return -EIO;
2585
2586 localbuf = alloca(size+1);
2587 localbuf[size] = '\0';
2588 memcpy(localbuf, buf, size);
2589
2590 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2591 size = -EINVAL;
2592 goto out;
2593 }
2594
2595 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2596 size = -EACCES;
2597 goto out;
2598 }
2599
2600 if (strcmp(f->file, "tasks") == 0 ||
2601 strcmp(f->file, "/tasks") == 0 ||
2602 strcmp(f->file, "/cgroup.procs") == 0 ||
2603 strcmp(f->file, "cgroup.procs") == 0)
2604 // special case - we have to translate the pids
2605 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2606 else
2607 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2608
2609 if (!r)
2610 size = -EINVAL;
2611
2612out:
2613 free_key(k);
2614 return size;
2615}
2616
2617int cg_chown(const char *path, uid_t uid, gid_t gid)
2618{
2619 struct fuse_context *fc = fuse_get_context();
2620 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2621 struct cgfs_files *k = NULL;
2622 const char *cgroup;
2623 int ret;
2624
2625 if (!fc)
2626 return -EIO;
2627
2628 if (strcmp(path, "/cgroup") == 0)
2629 return -EINVAL;
2630
2631 controller = pick_controller_from_path(fc, path);
2632 if (!controller)
2633 return -EINVAL;
2634 cgroup = find_cgroup_in_path(path);
2635 if (!cgroup)
2636 /* this is just /cgroup/controller */
2637 return -EINVAL;
2638
2639 get_cgdir_and_path(cgroup, &cgdir, &last);
2640
2641 if (!last) {
2642 path1 = "/";
2643 path2 = cgdir;
2644 } else {
2645 path1 = cgdir;
2646 path2 = last;
2647 }
2648
2649 if (is_child_cgroup(controller, path1, path2)) {
2650 // get uid, gid, from '/tasks' file and make up a mode
2651 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2652 k = cgfs_get_key(controller, cgroup, "tasks");
2653
2654 } else
2655 k = cgfs_get_key(controller, path1, path2);
2656
2657 if (!k) {
2658 ret = -EINVAL;
2659 goto out;
2660 }
2661
2662 /*
2663 * This being a fuse request, the uid and gid must be valid
2664 * in the caller's namespace. So we can just check to make
2665 * sure that the caller is root in his uid, and privileged
2666 * over the file's current owner.
2667 */
2668 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2669 ret = -EACCES;
2670 goto out;
2671 }
2672
2673 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2674
2675out:
2676 free_key(k);
2677 free(cgdir);
2678
2679 return ret;
2680}
2681
2682int cg_chmod(const char *path, mode_t mode)
2683{
2684 struct fuse_context *fc = fuse_get_context();
2685 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2686 struct cgfs_files *k = NULL;
2687 const char *cgroup;
2688 int ret;
2689
2690 if (!fc)
2691 return -EIO;
2692
2693 if (strcmp(path, "/cgroup") == 0)
2694 return -EINVAL;
2695
2696 controller = pick_controller_from_path(fc, path);
2697 if (!controller)
2698 return -EINVAL;
2699 cgroup = find_cgroup_in_path(path);
2700 if (!cgroup)
2701 /* this is just /cgroup/controller */
2702 return -EINVAL;
2703
2704 get_cgdir_and_path(cgroup, &cgdir, &last);
2705
2706 if (!last) {
2707 path1 = "/";
2708 path2 = cgdir;
2709 } else {
2710 path1 = cgdir;
2711 path2 = last;
2712 }
2713
2714 if (is_child_cgroup(controller, path1, path2)) {
2715 // get uid, gid, from '/tasks' file and make up a mode
2716 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2717 k = cgfs_get_key(controller, cgroup, "tasks");
2718
2719 } else
2720 k = cgfs_get_key(controller, path1, path2);
2721
2722 if (!k) {
2723 ret = -EINVAL;
2724 goto out;
2725 }
2726
2727 /*
2728 * This being a fuse request, the uid and gid must be valid
2729 * in the caller's namespace. So we can just check to make
2730 * sure that the caller is root in his uid, and privileged
2731 * over the file's current owner.
2732 */
2733 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2734 ret = -EPERM;
2735 goto out;
2736 }
2737
2738 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2739 ret = -EINVAL;
2740 goto out;
2741 }
2742
2743 ret = 0;
2744out:
2745 free_key(k);
2746 free(cgdir);
2747 return ret;
2748}
2749
2750int cg_mkdir(const char *path, mode_t mode)
2751{
2752 struct fuse_context *fc = fuse_get_context();
2753 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2754 const char *cgroup;
2755 int ret;
2756
2757 if (!fc)
2758 return -EIO;
2759
2760
2761 controller = pick_controller_from_path(fc, path);
2762 if (!controller)
2763 return -EINVAL;
2764
2765 cgroup = find_cgroup_in_path(path);
2766 if (!cgroup)
2767 return -EINVAL;
2768
2769 get_cgdir_and_path(cgroup, &cgdir, &last);
2770 if (!last)
2771 path1 = "/";
2772 else
2773 path1 = cgdir;
2774
2775 pid_t initpid = lookup_initpid_in_store(fc->pid);
2776 if (initpid <= 0)
2777 initpid = fc->pid;
2778 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2779 if (!next)
2780 ret = -EINVAL;
2781 else if (last && strcmp(next, last) == 0)
2782 ret = -EEXIST;
2783 else
2784 ret = -ENOENT;
2785 goto out;
2786 }
2787
2788 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2789 ret = -EACCES;
2790 goto out;
2791 }
2792 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2793 ret = -EACCES;
2794 goto out;
2795 }
2796
2797 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2798
2799out:
2800 free(cgdir);
2801 free(next);
2802 return ret;
2803}
2804
2805int cg_rmdir(const char *path)
2806{
2807 struct fuse_context *fc = fuse_get_context();
2808 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2809 const char *cgroup;
2810 int ret;
2811
2812 if (!fc)
2813 return -EIO;
2814
2815 controller = pick_controller_from_path(fc, path);
2816 if (!controller)
2817 return -EINVAL;
2818
2819 cgroup = find_cgroup_in_path(path);
2820 if (!cgroup)
2821 return -EINVAL;
2822
2823 get_cgdir_and_path(cgroup, &cgdir, &last);
2824 if (!last) {
2825 ret = -EINVAL;
2826 goto out;
2827 }
2828
2829 pid_t initpid = lookup_initpid_in_store(fc->pid);
2830 if (initpid <= 0)
2831 initpid = fc->pid;
2832 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2833 if (!last || strcmp(next, last) == 0)
2834 ret = -EBUSY;
2835 else
2836 ret = -ENOENT;
2837 goto out;
2838 }
2839
2840 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2841 ret = -EACCES;
2842 goto out;
2843 }
2844 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2845 ret = -EACCES;
2846 goto out;
2847 }
2848
2849 if (!cgfs_remove(controller, cgroup)) {
2850 ret = -EINVAL;
2851 goto out;
2852 }
2853
2854 ret = 0;
2855
2856out:
2857 free(cgdir);
2858 free(next);
2859 return ret;
2860}
2861
/* Return true iff @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2868
/*
 * Scan the newline-separated text @memstat for the first line starting
 * with "total_cache" and store the number following it, divided by
 * 1024, in @v.  @v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *nl;

	*v = 0;
	for (; *memstat; memstat = nl + 1) {
		if (startswith(memstat, "total_cache")) {
			sscanf(memstat + 11, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(memstat, '\n');
		if (!nl)
			return;
	}
}
2886
/*
 * Scan the newline-separated text @str for the first line starting
 * with "<major>:<minor> <iotype>" and store the number following that
 * prefix in @v.  @v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (; *str; str = nl + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}

		nl = strchr(str, '\n');
		if (!nl)
			return;
	}
}
2909
2910static int read_file(const char *path, char *buf, size_t size,
2911 struct file_info *d)
2912{
2913 size_t linelen = 0, total_len = 0, rv = 0;
2914 char *line = NULL;
2915 char *cache = d->buf;
2916 size_t cache_size = d->buflen;
2917 FILE *f = fopen(path, "r");
2918 if (!f)
2919 return 0;
2920
2921 while (getline(&line, &linelen, f) != -1) {
a262ddb7 2922 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
2923 if (l < 0) {
2924 perror("Error writing to cache");
2925 rv = 0;
2926 goto err;
2927 }
2928 if (l >= cache_size) {
2929 fprintf(stderr, "Internal error: truncated write to cache\n");
2930 rv = 0;
2931 goto err;
2932 }
2933 cache += l;
2934 cache_size -= l;
2935 total_len += l;
2936 }
2937
2938 d->size = total_len;
a262ddb7
CB
2939 if (total_len > size)
2940 total_len = size;
237e200e
SH
2941
2942 /* read from off 0 */
2943 memcpy(buf, d->buf, total_len);
2944 rv = total_len;
2945 err:
2946 fclose(f);
2947 free(line);
2948 return rv;
2949}
2950
2951/*
2952 * FUSE ops for /proc
2953 */
2954
/*
 * Return memory.limit_in_bytes of @cgroup parsed as a decimal number,
 * or (unsigned long)-1 when the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *raw = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);

	free(raw);
	return limit;
}
2967
/*
 * Return the smallest memory limit found on @cgroup or any of its
 * ancestors, walking upwards with dirname() until "/" is reached.
 * Ancestors whose limit cannot be read (get_memlimit() == -1) are
 * ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk);
	unsigned long cur = 0;

	while (strcmp(walk, "/") != 0) {
		walk = dirname(walk);
		cur = get_memlimit(walk);
		if (cur != -1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
2984
2985static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2986 struct fuse_file_info *fi)
2987{
2988 struct fuse_context *fc = fuse_get_context();
2989 struct file_info *d = (struct file_info *)fi->fh;
2990 char *cg;
2991 char *memusage_str = NULL, *memstat_str = NULL,
2992 *memswlimit_str = NULL, *memswusage_str = NULL,
2993 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2994 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2995 cached = 0, hosttotal = 0;
2996 char *line = NULL;
2997 size_t linelen = 0, total_len = 0, rv = 0;
2998 char *cache = d->buf;
2999 size_t cache_size = d->buflen;
3000 FILE *f = NULL;
3001
3002 if (offset){
3003 if (offset > d->size)
3004 return -EINVAL;
3005 if (!d->cached)
3006 return 0;
3007 int left = d->size - offset;
3008 total_len = left > size ? size: left;
3009 memcpy(buf, cache + offset, total_len);
3010 return total_len;
3011 }
3012
3013 pid_t initpid = lookup_initpid_in_store(fc->pid);
3014 if (initpid <= 0)
3015 initpid = fc->pid;
3016 cg = get_pid_cgroup(initpid, "memory");
3017 if (!cg)
3018 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3019 prune_init_slice(cg);
237e200e
SH
3020
3021 memlimit = get_min_memlimit(cg);
3022 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3023 goto err;
3024 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3025 goto err;
3026
3027 // Following values are allowed to fail, because swapaccount might be turned
3028 // off for current kernel
3029 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3030 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3031 {
3032 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3033 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3034 goto err;
3035 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3036 goto err;
3037
3038 memswlimit = strtoul(memswlimit_str, NULL, 10);
3039 memswusage = strtoul(memswusage_str, NULL, 10);
3040
3041 if (!strcmp(memswlimit_str, memswlimit_default_str))
3042 memswlimit = 0;
3043 if (!strcmp(memswusage_str, memswusage_default_str))
3044 memswusage = 0;
3045
3046 memswlimit = memswlimit / 1024;
3047 memswusage = memswusage / 1024;
3048 }
3049
3050 memusage = strtoul(memusage_str, NULL, 10);
3051 memlimit /= 1024;
3052 memusage /= 1024;
3053
3054 get_mem_cached(memstat_str, &cached);
3055
3056 f = fopen("/proc/meminfo", "r");
3057 if (!f)
3058 goto err;
3059
3060 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3061 ssize_t l;
237e200e
SH
3062 char *printme, lbuf[100];
3063
3064 memset(lbuf, 0, 100);
3065 if (startswith(line, "MemTotal:")) {
3066 sscanf(line+14, "%lu", &hosttotal);
3067 if (hosttotal < memlimit)
3068 memlimit = hosttotal;
3069 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3070 printme = lbuf;
3071 } else if (startswith(line, "MemFree:")) {
3072 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3073 printme = lbuf;
3074 } else if (startswith(line, "MemAvailable:")) {
3075 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
3076 printme = lbuf;
3077 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3078 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
3079 printme = lbuf;
3080 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
f676eb79 3081 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
237e200e
SH
3082 (memswlimit - memlimit) - (memswusage - memusage));
3083 printme = lbuf;
da35d72a
SH
3084 } else if (startswith(line, "Slab:")) {
3085 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3086 printme = lbuf;
237e200e
SH
3087 } else if (startswith(line, "Buffers:")) {
3088 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3089 printme = lbuf;
3090 } else if (startswith(line, "Cached:")) {
3091 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3092 printme = lbuf;
3093 } else if (startswith(line, "SwapCached:")) {
3094 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3095 printme = lbuf;
3096 } else
3097 printme = line;
3098
3099 l = snprintf(cache, cache_size, "%s", printme);
3100 if (l < 0) {
3101 perror("Error writing to cache");
3102 rv = 0;
3103 goto err;
3104
3105 }
3106 if (l >= cache_size) {
3107 fprintf(stderr, "Internal error: truncated write to cache\n");
3108 rv = 0;
3109 goto err;
3110 }
3111
3112 cache += l;
3113 cache_size -= l;
3114 total_len += l;
3115 }
3116
3117 d->cached = 1;
3118 d->size = total_len;
3119 if (total_len > size ) total_len = size;
3120 memcpy(buf, d->buf, total_len);
3121
3122 rv = total_len;
3123err:
3124 if (f)
3125 fclose(f);
3126 free(line);
3127 free(cg);
3128 free(memusage_str);
3129 free(memswlimit_str);
3130 free(memswusage_str);
3131 free(memstat_str);
3132 free(memswlimit_default_str);
3133 free(memswusage_default_str);
3134 return rv;
3135}
3136
/*
 * Read cpuset.cpus of cgroup @cg.
 * Returns a newly allocated string which the caller must free, or NULL
 * when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3149
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true iff @line is a "processor : N" line from /proc/cpuinfo
 * and cpu N is reported as a member of @cpuset by cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3160
/*
 * Check whether @line is a "^processor : N" line as found in
 * /proc/cpuinfo, i.e. the word "processor", a colon and a number.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3172
3173static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3174 struct fuse_file_info *fi)
3175{
3176 struct fuse_context *fc = fuse_get_context();
3177 struct file_info *d = (struct file_info *)fi->fh;
3178 char *cg;
3179 char *cpuset = NULL;
3180 char *line = NULL;
3181 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3182 bool am_printing = false, firstline = true, is_s390x = false;
3183 int curcpu = -1, cpu;
237e200e
SH
3184 char *cache = d->buf;
3185 size_t cache_size = d->buflen;
3186 FILE *f = NULL;
3187
3188 if (offset){
3189 if (offset > d->size)
3190 return -EINVAL;
3191 if (!d->cached)
3192 return 0;
3193 int left = d->size - offset;
3194 total_len = left > size ? size: left;
3195 memcpy(buf, cache + offset, total_len);
3196 return total_len;
3197 }
3198
3199 pid_t initpid = lookup_initpid_in_store(fc->pid);
3200 if (initpid <= 0)
3201 initpid = fc->pid;
3202 cg = get_pid_cgroup(initpid, "cpuset");
3203 if (!cg)
3204 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3205 prune_init_slice(cg);
237e200e
SH
3206
3207 cpuset = get_cpuset(cg);
3208 if (!cpuset)
3209 goto err;
3210
3211 f = fopen("/proc/cpuinfo", "r");
3212 if (!f)
3213 goto err;
3214
3215 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3216 ssize_t l;
f676eb79
SH
3217 if (firstline) {
3218 firstline = false;
3219 if (strstr(line, "IBM/S390") != NULL) {
3220 is_s390x = true;
3221 am_printing = true;
5ed9d4e2 3222 continue;
f676eb79
SH
3223 }
3224 }
5ed9d4e2
SH
3225 if (strncmp(line, "# processors:", 12) == 0)
3226 continue;
237e200e
SH
3227 if (is_processor_line(line)) {
3228 am_printing = cpuline_in_cpuset(line, cpuset);
3229 if (am_printing) {
3230 curcpu ++;
3231 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3232 if (l < 0) {
3233 perror("Error writing to cache");
3234 rv = 0;
3235 goto err;
3236 }
3237 if (l >= cache_size) {
3238 fprintf(stderr, "Internal error: truncated write to cache\n");
3239 rv = 0;
3240 goto err;
3241 }
3242 cache += l;
3243 cache_size -= l;
3244 total_len += l;
3245 }
3246 continue;
f676eb79
SH
3247 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3248 char *p;
3249 if (!cpu_in_cpuset(cpu, cpuset))
3250 continue;
3251 curcpu ++;
3252 p = strchr(line, ':');
3253 if (!p || !*p)
3254 goto err;
3255 p++;
5ed9d4e2 3256 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3257 if (l < 0) {
3258 perror("Error writing to cache");
3259 rv = 0;
3260 goto err;
3261 }
3262 if (l >= cache_size) {
3263 fprintf(stderr, "Internal error: truncated write to cache\n");
3264 rv = 0;
3265 goto err;
3266 }
3267 cache += l;
3268 cache_size -= l;
3269 total_len += l;
3270 continue;
3271
237e200e
SH
3272 }
3273 if (am_printing) {
3274 l = snprintf(cache, cache_size, "%s", line);
3275 if (l < 0) {
3276 perror("Error writing to cache");
3277 rv = 0;
3278 goto err;
3279 }
3280 if (l >= cache_size) {
3281 fprintf(stderr, "Internal error: truncated write to cache\n");
3282 rv = 0;
3283 goto err;
3284 }
3285 cache += l;
3286 cache_size -= l;
3287 total_len += l;
3288 }
3289 }
3290
5ed9d4e2
SH
3291 if (is_s390x) {
3292 char *origcache = d->buf;
a262ddb7 3293 ssize_t l;
5ed9d4e2
SH
3294 do {
3295 d->buf = malloc(d->buflen);
3296 } while (!d->buf);
3297 cache = d->buf;
3298 cache_size = d->buflen;
3299 total_len = 0;
3300 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3301 if (l < 0 || l >= cache_size) {
3302 free(origcache);
3303 goto err;
3304 }
3305 cache_size -= l;
3306 cache += l;
3307 total_len += l;
3308 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3309 if (l < 0 || l >= cache_size) {
3310 free(origcache);
3311 goto err;
3312 }
3313 cache_size -= l;
3314 cache += l;
3315 total_len += l;
3316 l = snprintf(cache, cache_size, "%s", origcache);
3317 free(origcache);
3318 if (l < 0 || l >= cache_size)
3319 goto err;
3320 total_len += l;
3321 }
3322
237e200e
SH
3323 d->cached = 1;
3324 d->size = total_len;
3325 if (total_len > size ) total_len = size;
3326
3327 /* read from off 0 */
3328 memcpy(buf, d->buf, total_len);
3329 rv = total_len;
3330err:
3331 if (f)
3332 fclose(f);
3333 free(line);
3334 free(cpuset);
3335 free(cg);
3336 return rv;
3337}
3338
3339static int proc_stat_read(char *buf, size_t size, off_t offset,
3340 struct fuse_file_info *fi)
3341{
3342 struct fuse_context *fc = fuse_get_context();
3343 struct file_info *d = (struct file_info *)fi->fh;
3344 char *cg;
3345 char *cpuset = NULL;
3346 char *line = NULL;
3347 size_t linelen = 0, total_len = 0, rv = 0;
3348 int curcpu = -1; /* cpu numbering starts at 0 */
3349 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3350 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3351 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3352#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3353 char cpuall[CPUALL_MAX_SIZE];
3354 /* reserve for cpu all */
3355 char *cache = d->buf + CPUALL_MAX_SIZE;
3356 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3357 FILE *f = NULL;
3358
3359 if (offset){
3360 if (offset > d->size)
3361 return -EINVAL;
3362 if (!d->cached)
3363 return 0;
3364 int left = d->size - offset;
3365 total_len = left > size ? size: left;
3366 memcpy(buf, d->buf + offset, total_len);
3367 return total_len;
3368 }
3369
3370 pid_t initpid = lookup_initpid_in_store(fc->pid);
3371 if (initpid <= 0)
3372 initpid = fc->pid;
3373 cg = get_pid_cgroup(initpid, "cpuset");
3374 if (!cg)
3375 return read_file("/proc/stat", buf, size, d);
6d2f6996 3376 prune_init_slice(cg);
237e200e
SH
3377
3378 cpuset = get_cpuset(cg);
3379 if (!cpuset)
3380 goto err;
3381
3382 f = fopen("/proc/stat", "r");
3383 if (!f)
3384 goto err;
3385
3386 //skip first line
3387 if (getline(&line, &linelen, f) < 0) {
3388 fprintf(stderr, "proc_stat_read read first line failed\n");
3389 goto err;
3390 }
3391
3392 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3393 ssize_t l;
237e200e
SH
3394 int cpu;
3395 char cpu_char[10]; /* That's a lot of cores */
3396 char *c;
3397
3398 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3399 /* not a ^cpuN line containing a number N, just print it */
3400 l = snprintf(cache, cache_size, "%s", line);
3401 if (l < 0) {
3402 perror("Error writing to cache");
3403 rv = 0;
3404 goto err;
3405 }
3406 if (l >= cache_size) {
3407 fprintf(stderr, "Internal error: truncated write to cache\n");
3408 rv = 0;
3409 goto err;
3410 }
3411 cache += l;
3412 cache_size -= l;
3413 total_len += l;
3414 continue;
3415 }
3416
3417 if (sscanf(cpu_char, "%d", &cpu) != 1)
3418 continue;
3419 if (!cpu_in_cpuset(cpu, cpuset))
3420 continue;
3421 curcpu ++;
3422
3423 c = strchr(line, ' ');
3424 if (!c)
3425 continue;
3426 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3427 if (l < 0) {
3428 perror("Error writing to cache");
3429 rv = 0;
3430 goto err;
3431
3432 }
3433 if (l >= cache_size) {
3434 fprintf(stderr, "Internal error: truncated write to cache\n");
3435 rv = 0;
3436 goto err;
3437 }
3438
3439 cache += l;
3440 cache_size -= l;
3441 total_len += l;
3442
3443 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3444 &softirq, &steal, &guest) != 9)
3445 continue;
3446 user_sum += user;
3447 nice_sum += nice;
3448 system_sum += system;
3449 idle_sum += idle;
3450 iowait_sum += iowait;
3451 irq_sum += irq;
3452 softirq_sum += softirq;
3453 steal_sum += steal;
3454 guest_sum += guest;
3455 }
3456
3457 cache = d->buf;
3458
3459 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3460 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3461 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3462 memcpy(cache, cpuall, cpuall_len);
3463 cache += cpuall_len;
3464 } else{
3465 /* shouldn't happen */
3466 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3467 cpuall_len = 0;
3468 }
3469
3470 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3471 total_len += cpuall_len;
3472 d->cached = 1;
3473 d->size = total_len;
3474 if (total_len > size ) total_len = size;
3475
3476 memcpy(buf, d->buf, total_len);
3477 rv = total_len;
3478
3479err:
3480 if (f)
3481 fclose(f);
3482 free(line);
3483 free(cpuset);
3484 free(cg);
3485 return rv;
3486}
3487
/*
 * Return the number of seconds between now and the st_ctime of
 * /proc/<initpid>, where <initpid> is what lookup_initpid_in_store()
 * returns for @pid.  Returns 0 when no init pid is known, the path does
 * not fit, or lstat() fails.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, 100, "/proc/%d", reaper);
	if (n < 0 || n >= 100)
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3508
/*
 * Return cpuacct.usage of the cpuacct cgroup of @task's init pid (as
 * returned by lookup_initpid_in_store()), divided by 1000000000.
 * Returns 0 when the init pid, the cgroup or the value is unavailable.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;
	pid_t initpid;

	initpid = lookup_initpid_in_store(task);
	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup) {
		prune_init_slice(cgroup);
		if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
			usage = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(cgroup);
	free(usage_str);
	return usage;
}
3532
#if RELOADTEST
/*
 * Reload-test marker: create (or truncate) the file "iwashere" in the
 * current working directory.  Exits the process when the working
 * directory cannot be determined; a failing creat() is ignored.
 */
void iwashere(void)
{
	char *cwd = get_current_dir_name();
	char *marker;
	size_t need;
	int fd;

	if (!cwd)
		exit(1);

	need = strlen(cwd) + strlen("/iwashere") + 1;
	marker = alloca(need);
	snprintf(marker, need, "%s/iwashere", cwd);
	free(cwd);

	fd = creat(marker, 0755);
	if (fd >= 0)
		close(fd);
}
#endif
3551
3552/*
3553 * We read /proc/uptime and reuse its second field.
3554 * For the first field, we use the mtime for the reaper for
3555 * the calling pid as returned by getreaperage
3556 */
3557static int proc_uptime_read(char *buf, size_t size, off_t offset,
3558 struct fuse_file_info *fi)
3559{
3560 struct fuse_context *fc = fuse_get_context();
3561 struct file_info *d = (struct file_info *)fi->fh;
3562 long int reaperage = getreaperage(fc->pid);
3563 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3564 char *cache = d->buf;
a262ddb7 3565 ssize_t total_len = 0;
237e200e
SH
3566
3567#if RELOADTEST
3568 iwashere();
3569#endif
3570
3571 if (offset){
3572 if (offset > d->size)
3573 return -EINVAL;
3574 if (!d->cached)
3575 return 0;
3576 int left = d->size - offset;
3577 total_len = left > size ? size: left;
3578 memcpy(buf, cache + offset, total_len);
3579 return total_len;
3580 }
3581
3582 idletime = reaperage - busytime;
3583 if (idletime > reaperage)
3584 idletime = reaperage;
3585
3586 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3587 if (total_len < 0){
3588 perror("Error writing to cache");
3589 return 0;
3590 }
3591
3592 d->size = (int)total_len;
3593 d->cached = 1;
3594
3595 if (total_len > size) total_len = size;
3596
3597 memcpy(buf, d->buf, total_len);
3598 return total_len;
3599}
3600
3601static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3602 struct fuse_file_info *fi)
3603{
3604 char dev_name[72];
3605 struct fuse_context *fc = fuse_get_context();
3606 struct file_info *d = (struct file_info *)fi->fh;
3607 char *cg;
3608 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3609 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3610 unsigned long read = 0, write = 0;
3611 unsigned long read_merged = 0, write_merged = 0;
3612 unsigned long read_sectors = 0, write_sectors = 0;
3613 unsigned long read_ticks = 0, write_ticks = 0;
3614 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3615 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3616 char *cache = d->buf;
3617 size_t cache_size = d->buflen;
3618 char *line = NULL;
3619 size_t linelen = 0, total_len = 0, rv = 0;
3620 unsigned int major = 0, minor = 0;
3621 int i = 0;
3622 FILE *f = NULL;
3623
3624 if (offset){
3625 if (offset > d->size)
3626 return -EINVAL;
3627 if (!d->cached)
3628 return 0;
3629 int left = d->size - offset;
3630 total_len = left > size ? size: left;
3631 memcpy(buf, cache + offset, total_len);
3632 return total_len;
3633 }
3634
3635 pid_t initpid = lookup_initpid_in_store(fc->pid);
3636 if (initpid <= 0)
3637 initpid = fc->pid;
3638 cg = get_pid_cgroup(initpid, "blkio");
3639 if (!cg)
3640 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 3641 prune_init_slice(cg);
237e200e 3642
2209fe50 3643 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 3644 goto err;
2209fe50 3645 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 3646 goto err;
2209fe50 3647 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 3648 goto err;
2209fe50 3649 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 3650 goto err;
2209fe50 3651 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
3652 goto err;
3653
3654
3655 f = fopen("/proc/diskstats", "r");
3656 if (!f)
3657 goto err;
3658
3659 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3660 ssize_t l;
2209fe50 3661 char lbuf[256];
237e200e
SH
3662
3663 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 3664 if (i != 3)
237e200e 3665 continue;
2209fe50
SH
3666
3667 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3668 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3669 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3670 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3671 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3672 read_sectors = read_sectors/512;
3673 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3674 write_sectors = write_sectors/512;
3675
3676 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3677 rd_svctm = rd_svctm/1000000;
3678 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3679 rd_wait = rd_wait/1000000;
3680 read_ticks = rd_svctm + rd_wait;
3681
3682 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3683 wr_svctm = wr_svctm/1000000;
3684 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3685 wr_wait = wr_wait/1000000;
3686 write_ticks = wr_svctm + wr_wait;
3687
3688 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3689 tot_ticks = tot_ticks/1000000;
237e200e
SH
3690
3691 memset(lbuf, 0, 256);
2db31eb6
SH
3692 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3693 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3694 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3695 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3696 else
3697 continue;
237e200e 3698
2209fe50 3699 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
3700 if (l < 0) {
3701 perror("Error writing to fuse buf");
3702 rv = 0;
3703 goto err;
3704 }
3705 if (l >= cache_size) {
3706 fprintf(stderr, "Internal error: truncated write to cache\n");
3707 rv = 0;
3708 goto err;
3709 }
3710 cache += l;
3711 cache_size -= l;
3712 total_len += l;
3713 }
3714
3715 d->cached = 1;
3716 d->size = total_len;
3717 if (total_len > size ) total_len = size;
3718 memcpy(buf, d->buf, total_len);
3719
3720 rv = total_len;
3721err:
3722 free(cg);
3723 if (f)
3724 fclose(f);
3725 free(line);
3726 free(io_serviced_str);
3727 free(io_merged_str);
3728 free(io_service_bytes_str);
3729 free(io_wait_time_str);
3730 free(io_service_time_str);
3731 return rv;
3732}
3733
70dcc12e
SH
3734static int proc_swaps_read(char *buf, size_t size, off_t offset,
3735 struct fuse_file_info *fi)
3736{
3737 struct fuse_context *fc = fuse_get_context();
3738 struct file_info *d = (struct file_info *)fi->fh;
3739 char *cg = NULL;
3740 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3741 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3742 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
3743 ssize_t total_len = 0, rv = 0;
3744 ssize_t l = 0;
70dcc12e
SH
3745 char *cache = d->buf;
3746
3747 if (offset) {
3748 if (offset > d->size)
3749 return -EINVAL;
3750 if (!d->cached)
3751 return 0;
3752 int left = d->size - offset;
3753 total_len = left > size ? size: left;
3754 memcpy(buf, cache + offset, total_len);
3755 return total_len;
3756 }
3757
3758 pid_t initpid = lookup_initpid_in_store(fc->pid);
3759 if (initpid <= 0)
3760 initpid = fc->pid;
3761 cg = get_pid_cgroup(initpid, "memory");
3762 if (!cg)
3763 return read_file("/proc/swaps", buf, size, d);
6d2f6996 3764 prune_init_slice(cg);
70dcc12e
SH
3765
3766 if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3767 goto err;
3768
3769 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3770 goto err;
3771
3772 memlimit = strtoul(memlimit_str, NULL, 10);
3773 memusage = strtoul(memusage_str, NULL, 10);
3774
3775 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3776 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3777
3778 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3779 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3780 goto err;
3781 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3782 goto err;
3783
3784 memswlimit = strtoul(memswlimit_str, NULL, 10);
3785 memswusage = strtoul(memswusage_str, NULL, 10);
3786
3787 if (!strcmp(memswlimit_str, memswlimit_default_str))
3788 memswlimit = 0;
3789 if (!strcmp(memswusage_str, memswusage_default_str))
3790 memswusage = 0;
3791
3792 swap_total = (memswlimit - memlimit) / 1024;
3793 swap_free = (memswusage - memusage) / 1024;
3794 }
3795
3796 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3797
3798 /* When no mem + swap limit is specified or swapaccount=0*/
3799 if (!memswlimit) {
3800 char *line = NULL;
3801 size_t linelen = 0;
3802 FILE *f = fopen("/proc/meminfo", "r");
3803
3804 if (!f)
3805 goto err;
3806
3807 while (getline(&line, &linelen, f) != -1) {
3808 if (startswith(line, "SwapTotal:")) {
3809 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
3810 } else if (startswith(line, "SwapFree:")) {
3811 sscanf(line, "SwapFree: %8lu kB", &swap_free);
3812 }
3813 }
3814
3815 free(line);
3816 fclose(f);
3817 }
3818
3819 if (swap_total > 0) {
a262ddb7
CB
3820 l = snprintf(d->buf + total_len, d->size - total_len,
3821 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3822 swap_total, swap_free);
3823 total_len += l;
70dcc12e
SH
3824 }
3825
a262ddb7 3826 if (total_len < 0 || l < 0) {
70dcc12e
SH
3827 perror("Error writing to cache");
3828 rv = 0;
3829 goto err;
3830 }
3831
3832 d->cached = 1;
3833 d->size = (int)total_len;
3834
3835 if (total_len > size) total_len = size;
3836 memcpy(buf, d->buf, total_len);
3837 rv = total_len;
3838
3839err:
3840 free(cg);
3841 free(memswlimit_str);
3842 free(memlimit_str);
3843 free(memusage_str);
3844 free(memswusage_str);
3845 free(memswusage_default_str);
3846 free(memswlimit_default_str);
3847 return rv;
3848}
3849
237e200e
SH
/*
 * Return the number of bytes obtained by reading @which line by line until
 * getline() stops, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t got = getline(&linebuf, &cap, fp);

		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
3866
/*
 * Fill @sb with attributes for an entry of the emulated /proc tree.
 *
 * "/proc" itself is reported as a read/execute-only directory; each of the
 * emulated files is reported as a read-only regular file of size 0.  All
 * timestamps are set to the current CLOCK_REALTIME value and ownership to
 * uid/gid 0.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
	};
	struct timespec ts;
	size_t idx;

	memset(sb, 0, sizeof(*sb));

	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = ts;
	sb->st_mtim = ts;
	sb->st_ctim = ts;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
3895
3896int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3897 struct fuse_file_info *fi)
3898{
3899 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
3900 filler(buf, "meminfo", NULL, 0) != 0 ||
3901 filler(buf, "stat", NULL, 0) != 0 ||
3902 filler(buf, "uptime", NULL, 0) != 0 ||
70dcc12e
SH
3903 filler(buf, "diskstats", NULL, 0) != 0 ||
3904 filler(buf, "swaps", NULL, 0) != 0)
237e200e
SH
3905 return -EINVAL;
3906 return 0;
3907}
3908
3909int proc_open(const char *path, struct fuse_file_info *fi)
3910{
3911 int type = -1;
3912 struct file_info *info;
3913
3914 if (strcmp(path, "/proc/meminfo") == 0)
3915 type = LXC_TYPE_PROC_MEMINFO;
3916 else if (strcmp(path, "/proc/cpuinfo") == 0)
3917 type = LXC_TYPE_PROC_CPUINFO;
3918 else if (strcmp(path, "/proc/uptime") == 0)
3919 type = LXC_TYPE_PROC_UPTIME;
3920 else if (strcmp(path, "/proc/stat") == 0)
3921 type = LXC_TYPE_PROC_STAT;
3922 else if (strcmp(path, "/proc/diskstats") == 0)
3923 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
3924 else if (strcmp(path, "/proc/swaps") == 0)
3925 type = LXC_TYPE_PROC_SWAPS;
237e200e
SH
3926 if (type == -1)
3927 return -ENOENT;
3928
3929 info = malloc(sizeof(*info));
3930 if (!info)
3931 return -ENOMEM;
3932
3933 memset(info, 0, sizeof(*info));
3934 info->type = type;
3935
3936 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3937 do {
3938 info->buf = malloc(info->buflen);
3939 } while (!info->buf);
3940 memset(info->buf, 0, info->buflen);
3941 /* set actual size to buffer size */
3942 info->size = info->buflen;
3943
3944 fi->fh = (unsigned long)info;
3945 return 0;
3946}
3947
bddbb106
SH
/*
 * Access check for the emulated /proc files, which are all read-only.
 *
 * Returns 0 when @mask requests nothing beyond R_OK, -EACCES otherwise.
 * @path is not consulted.
 */
int proc_access(const char *path, int mask)
{
	const int beyond_read = mask & ~R_OK;

	return beyond_read ? -EACCES : 0;
}
3955
237e200e
SH
/*
 * Release handler for the emulated /proc files: passes @fi to
 * do_release_file_info() and always reports success.  @path is unused.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
3961
3962int proc_read(const char *path, char *buf, size_t size, off_t offset,
3963 struct fuse_file_info *fi)
3964{
3965 struct file_info *f = (struct file_info *) fi->fh;
3966
3967 switch (f->type) {
3968 case LXC_TYPE_PROC_MEMINFO:
3969 return proc_meminfo_read(buf, size, offset, fi);
3970 case LXC_TYPE_PROC_CPUINFO:
3971 return proc_cpuinfo_read(buf, size, offset, fi);
3972 case LXC_TYPE_PROC_UPTIME:
3973 return proc_uptime_read(buf, size, offset, fi);
3974 case LXC_TYPE_PROC_STAT:
3975 return proc_stat_read(buf, size, offset, fi);
3976 case LXC_TYPE_PROC_DISKSTATS:
3977 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
3978 case LXC_TYPE_PROC_SWAPS:
3979 return proc_swaps_read(buf, size, offset, fi);
237e200e
SH
3980 default:
3981 return -EINVAL;
3982 }
3983}
3984
/*
 * Constructor: read /proc/self/cgroup and register every hierarchy found.
 *
 * Each line is split at its first and last ':'; the text before the first
 * colon and the text between the two colons are passed to
 * store_hierarchy().  Parsing stops at the first malformed line or at the
 * first store_hierarchy() failure, in which case print_subsystems() is not
 * called.
 */
static void __attribute__((constructor)) collect_subsystems(void)
{
	FILE *fp;
	char *linebuf = NULL;
	size_t cap = 0;
	bool complete = true;

	fp = fopen("/proc/self/cgroup", "r");
	if (fp == NULL) {
		fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (getline(&linebuf, &cap, fp) != -1) {
		char *controllers, *tail;

		controllers = strchr(linebuf, ':');
		if (controllers == NULL) {
			complete = false;
			break;
		}
		*controllers = '\0';
		controllers++;

		tail = strrchr(controllers, ':');
		if (tail == NULL) {
			complete = false;
			break;
		}
		*tail = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (*controllers == '\0')
			continue;

		if (!store_hierarchy(linebuf, controllers)) {
			complete = false;
			break;
		}
	}

	if (complete)
		print_subsystems();

	free(linebuf);
	fclose(fp);
}
4026
4027static void __attribute__((destructor)) free_subsystems(void)
4028{
4029 int i;
4030
4031 for (i = 0; i < num_hierarchies; i++)
4032 if (hierarchies[i])
4033 free(hierarchies[i]);
4034 free(hierarchies);
4035}