]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
Don't use tasks file to determine access rights to its cgroup
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
2c51f8dd 3 * Copyright © 2014,2015 Canonical, Inc
758ad80c
SH
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357 25#include <sys/mount.h>
5b2dfd85 26#include <sys/epoll.h>
41bb9357 27#include <wait.h>
758ad80c 28
977ac879 29#ifdef FORTRAVIS
df062bcb
SH
30#define GLIB_DISABLE_DEPRECATION_WARNINGS
31#include <glib-object.h>
977ac879 32#endif
df062bcb 33
35482f91 34#include "cgfs.h"
2e9c0b32 35#include "config.h" // for VERSION
758ad80c 36
443d13f5
SH
/*
 * Kind of object a struct file_info handle describes; the value is kept
 * in file_info.type.  cg_opendir() tags its handles LXC_TYPE_CGDIR and
 * cg_open() tags its handles LXC_TYPE_CGFILE.  The LXC_TYPE_PROC_* values
 * presumably tag handles for the corresponding /proc files (their users
 * are not in this part of the file).
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
46
c688e1b3
SH
/*
 * Per-open-handle state, stored in fuse_file_info.fh by the open/opendir
 * handlers and freed again by do_release_file_info().
 */
struct file_info {
	char *controller;	/* heap copy of the controller name, may be NULL */
	char *cgroup;		/* heap copy of the cgroup path, may be NULL */
	char *file;		/* heap copy of the file (key) name, NULL for dirs */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;		/* set to 0 by the cgroup open handlers */
	int size;		/* actual data size */
	int cached;		/* NOTE(review): not used by the cgroup handlers */
};
57
97f1f27b
YY
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing it if necessary.
 *
 * src: pointer to the char* to append to; *src may be NULL on first use.
 * sz:  number of characters stored so far, not counting the trailing \0;
 *      updated on return.
 * asz: number of bytes currently allocated for *src; updated on return.
 * pid: the pid to append.
 *
 * Never fails: the allocation is retried until it succeeds.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char pidstr[30];
	int pidlen = snprintf(pidstr, sizeof(pidstr), "%d\n", (int)pid);

	if (!*src || pidlen + *sz + 1 >= *asz) {
		char *grown;

		/* one step is always enough: pidlen < BUF_RESERVE_SIZE */
		do {
			grown = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!grown);
		*src = grown;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy(*src + *sz, pidstr, pidlen);
	*sz += pidlen;
	(*src)[*sz] = '\0';
}
86
0afd85bd
SH
87static pid_t get_init_pid_for_task(pid_t task);
88
a05660a6
SH
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 * Returns 0 if the child exited normally with status 0, -1 otherwise
 * (waitpid failure, death by signal, or non-zero exit status).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
106
053a659d
SH
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map, and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 *
 * Each line of the map is "<ns base> <host base> <count>"; lines that do
 * not parse as three unsigned numbers are skipped.
 *
 * Returns the mapped id, or (unsigned int)-1 if the file cannot be
 * rewound, a range wraps around, or no range contains @in_id.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];

	/* the same FILE is scanned repeatedly, always start from the top */
	if (fseek(idfile, 0L, SEEK_SET) < 0)
		return -1;

	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count) != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			fprintf(stderr, "id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * hostuid <= in_id < hostuid+count and neither range
			 * wraps, so nsuid + (in_id - hostuid) is less than
			 * nsuid + count and cannot wrap either.
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
150
341b21ad
SH
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* size of the buffers used to build /proc/<pid>/... paths */
#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, has
 * privilege over files owned by @victim.
 *
 * - Either id being -1 (unmapped) is always rejected.
 * - Unless @req_ns_root is set, identical uids suffice (i.e. uid 1000 has
 *   access to files owned by uid 1000).
 * - Otherwise @uid must map to 0 in /proc/@pid/uid_map and @victim must be
 *   mapped there at all.
 *
 * Returns false as well when the uid map cannot be opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mapfile[PROCLEN];
	bool privileged;
	FILE *map;
	int n;

	if (victim == (uid_t)-1 || uid == (uid_t)-1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	n = snprintf(mapfile, PROCLEN, "/proc/%d/uid_map", pid);
	if (n < 0 || n >= PROCLEN)
		return false;

	map = fopen(mapfile, "r");
	if (!map)
		return false;

	/*
	 * The caller must be root in his namespace, and the victim must be
	 * mapped into that namespace; the second lookup is only made when
	 * the first one passes.
	 */
	privileged = convert_id_to_ns(map, uid) == 0 &&
		     convert_id_to_ns(map, victim) != (unsigned int)-1;

	fclose(map);
	return privileged;
}
206
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode encoded in the open(2)-style flags @req_mode.  Callers shift the
 * owner or group bits down into the "other" position before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY,
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
226
a8b6c3e0
SH
227
/*
 * Return the path component of @taskcg that follows @querycg.
 *
 * taskcg is a/b/c/d/e
 * querycg is a/b/c
 * we return 'd'
 *
 * When querycg is "/", only the leading character of taskcg is skipped.
 * taskcg must be strictly longer than querycg, otherwise a complaint is
 * printed and NULL is returned.  No check is made that taskcg actually
 * starts with querycg.
 *
 * The result is newly allocated and must be freed by the caller; NULL is
 * also returned when the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* step over the query prefix and the separator behind it */
	skip = strcmp(querycg, "/") == 0 ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
253
2c51f8dd
SH
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
260
261static char *get_pid_cgroup(pid_t pid, const char *contrl)
262{
263 char fnam[PROCLEN];
264 FILE *f;
265 char *answer = NULL;
266 char *line = NULL;
267 size_t len = 0;
268 int ret;
777dd831
SH
269 const char *h = find_mounted_controller(contrl);
270 if (!h)
271 return NULL;
2c51f8dd
SH
272
273 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
274 if (ret < 0 || ret >= PROCLEN)
275 return NULL;
276 if (!(f = fopen(fnam, "r")))
277 return NULL;
278
279 while (getline(&line, &len, f) != -1) {
280 char *c1, *c2;
281 if (!line[0])
282 continue;
283 c1 = strchr(line, ':');
284 if (!c1)
285 goto out;
286 c1++;
287 c2 = strchr(c1, ':');
288 if (!c2)
289 goto out;
290 *c2 = '\0';
777dd831 291 if (strcmp(c1, h) != 0)
2c51f8dd
SH
292 continue;
293 c2++;
294 stripnewline(c2);
295 do {
296 answer = strdup(c2);
297 } while (!answer);
298 break;
299 }
300
301out:
302 fclose(f);
303 free(line);
304 return answer;
305}
306
758ad80c
SH
307/*
308 * check whether a fuse context may access a cgroup dir or file
309 *
310 * If file is not null, it is a cgroup file to check under cg.
311 * If file is null, then we are checking perms on cg itself.
312 *
313 * For files we can check the mode of the list_keys result.
314 * For cgroups, we must make assumptions based on the files under the
315 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
316 * yet.
317 */
318static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
319{
35482f91 320 struct cgfs_files *k = NULL;
2c51f8dd 321 bool ret = false;
758ad80c 322
35482f91
SH
323 k = cgfs_get_key(contrl, cg, file);
324 if (!k)
758ad80c 325 return false;
35482f91
SH
326
327 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
328 if (perms_include(k->mode >> 6, mode)) {
329 ret = true;
2c51f8dd 330 goto out;
758ad80c
SH
331 }
332 }
35482f91
SH
333 if (fc->gid == k->gid) {
334 if (perms_include(k->mode >> 3, mode)) {
335 ret = true;
336 goto out;
337 }
338 }
339 ret = perms_include(k->mode, mode);
758ad80c 340
2c51f8dd 341out:
35482f91 342 free_key(k);
2c51f8dd 343 return ret;
3db25a35
SH
344}
345
04b5cbdc
SH
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg, in
 * place.  A path that consists of nothing but "/init.scope" becomes "/".
 * Paths not ending in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cglen = strlen(cg);
	size_t scopelen = strlen(INITSCOPE);
	char *suffix;

	/*
	 * Compare lengths before doing pointer arithmetic: forming a
	 * pointer in front of the start of the buffer is undefined.
	 */
	if (cglen < scopelen)
		return;

	suffix = cg + (cglen - scopelen);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';	/* keep the leading '/' */
	else
		*suffix = '\0';
}
360
3db25a35
SH
/*
 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If caller is in /a, he may act on /a/b, but not on /b.
 *
 * The caller's cgroup must match @cg on a path-component boundary, so a
 * caller in a/b is not treated as an ancestor of a/bc.
 *
 * If the answer is false and nextcg is not NULL, then *nextcg is set to
 * the result of get_next_cgroup_dir() - the next cgroup directory under
 * cg on the way to the caller's cgroup, or NULL if there is none - which
 * must be freed by the caller.
 *
 * Returns false as well when the caller's cgroup cannot be determined; in
 * that case *nextcg is left untouched.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	const char *linecmp;
	size_t len;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' for root cgroup, otherwise they pass
	 * in a cgroup without leading '/'
	 */
	linecmp = *cg == '/' ? c2 : c2 + 1;
	len = strlen(linecmp);

	/*
	 * A bare prefix match is not enough: the match has to end where a
	 * path component of cg ends (or the task cgroup is the root, i.e.
	 * empty or ending in '/').
	 */
	if (strncmp(linecmp, cg, len) == 0 &&
	    (len == 0 || linecmp[len - 1] == '/' ||
	     cg[len] == '\0' || cg[len] == '/'))
		answer = true;
	else if (nextcg)
		*nextcg = get_next_cgroup_dir(linecmp, cg);

	free(c2);
	return answer;
}
395
/*
 * If caller is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is "/" or a cgroup path without leading '/'.  The directory is
 * visible when it is the root, when the caller sits in the root cgroup,
 * when it equals the caller's cgroup, or when one of the two paths is a
 * whole-component prefix of the other.
 *
 * Returns false when the caller's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw;
	const char *task_cg;
	size_t target_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0)
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	/* compare without the leading '/' of the task's cgroup */
	task_cg = raw + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything.  The prefix
		 * tests below look for a '/' separator, but the only '/' of
		 * the root path was chopped off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		/* same length but different paths */
		visible = false;
	}

	free(raw);
	return visible;
}
446
758ad80c 447/*
2c51f8dd
SH
448 * given /cgroup/freezer/a/b, return "freezer".
449 * the returned char* should NOT be freed.
758ad80c
SH
450 */
451static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
452{
453 const char *p1;
2c51f8dd 454 char *contr, *slash;
758ad80c
SH
455
456 if (strlen(path) < 9)
457 return NULL;
ac5d9d48
SH
458 if (*(path+7) != '/')
459 return NULL;
758ad80c 460 p1 = path+8;
2c51f8dd
SH
461 contr = strdupa(p1);
462 if (!contr)
463 return NULL;
464 slash = strstr(contr, "/");
758ad80c
SH
465 if (slash)
466 *slash = '\0';
467
758ad80c 468 int i;
35482f91
SH
469 for (i = 0; i < num_hierarchies; i++) {
470 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
471 return hierarchies[i];
758ad80c 472 }
758ad80c
SH
473 return NULL;
474}
475
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL if the path is shorter than 9 characters or has no
 * such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
491
/*
 * split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg cut off at its last '/'
 * (or a copy of all of @cg when it contains no '/'); it should be freed.
 * @last is set to point at the last '/' inside @cg itself - so the string
 * it points to starts with that '/' - or to NULL when @cg contains no
 * '/'; it must not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;

	/* must succeed: retry until the copy can be allocated */
	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last)
		*strrchr(copy, '/') = '\0';
}
511
512/*
2ad6d2bd 513 * FUSE ops for /cgroup
758ad80c 514 */
2ad6d2bd 515
758ad80c
SH
516static int cg_getattr(const char *path, struct stat *sb)
517{
518 struct timespec now;
519 struct fuse_context *fc = fuse_get_context();
2c51f8dd 520 char * cgdir = NULL;
febf2b87 521 char *last = NULL, *path1, *path2;
35482f91 522 struct cgfs_files *k = NULL;
758ad80c 523 const char *cgroup;
2c51f8dd
SH
524 const char *controller = NULL;
525 int ret = -ENOENT;
758ad80c
SH
526
527
528 if (!fc)
529 return -EIO;
530
531 memset(sb, 0, sizeof(struct stat));
532
533 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
534 return -EINVAL;
535
536 sb->st_uid = sb->st_gid = 0;
537 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
538 sb->st_size = 0;
539
540 if (strcmp(path, "/cgroup") == 0) {
541 sb->st_mode = S_IFDIR | 00755;
542 sb->st_nlink = 2;
543 return 0;
544 }
545
546 controller = pick_controller_from_path(fc, path);
547 if (!controller)
548 return -EIO;
758ad80c
SH
549 cgroup = find_cgroup_in_path(path);
550 if (!cgroup) {
551 /* this is just /cgroup/controller, return it as a dir */
552 sb->st_mode = S_IFDIR | 00755;
553 sb->st_nlink = 2;
554 return 0;
555 }
341b21ad 556
febf2b87 557 get_cgdir_and_path(cgroup, &cgdir, &last);
758ad80c 558
febf2b87 559 if (!last) {
758ad80c
SH
560 path1 = "/";
561 path2 = cgdir;
562 } else {
563 path1 = cgdir;
febf2b87 564 path2 = last;
758ad80c
SH
565 }
566
758ad80c 567 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
febf2b87
SH
568 * Then check that caller's cgroup is under path if last is a child
569 * cgroup, or cgdir if last is a file */
758ad80c
SH
570
571 if (is_child_cgroup(controller, path1, path2)) {
a8b6c3e0
SH
572 if (!caller_may_see_dir(fc->pid, controller, cgroup)) {
573 ret = -ENOENT;
574 goto out;
575 }
f9a05025
SH
576 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
577 /* this is just /cgroup/controller, return it as a dir */
578 sb->st_mode = S_IFDIR | 00555;
579 sb->st_nlink = 2;
2c51f8dd
SH
580 ret = 0;
581 goto out;
582 }
583 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
584 ret = -EACCES;
585 goto out;
f9a05025 586 }
758ad80c 587
053a659d
SH
588 // get uid, gid, from '/tasks' file and make up a mode
589 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
590 sb->st_mode = S_IFDIR | 00755;
febf2b87 591 k = cgfs_get_key(controller, cgroup, NULL);
053a659d 592 if (!k) {
053a659d
SH
593 sb->st_uid = sb->st_gid = 0;
594 } else {
053a659d
SH
595 sb->st_uid = k->uid;
596 sb->st_gid = k->gid;
597 }
2c51f8dd 598 free_key(k);
758ad80c 599 sb->st_nlink = 2;
2c51f8dd
SH
600 ret = 0;
601 goto out;
758ad80c
SH
602 }
603
35482f91 604 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
758ad80c 605 sb->st_mode = S_IFREG | k->mode;
053a659d 606 sb->st_nlink = 1;
758ad80c
SH
607 sb->st_uid = k->uid;
608 sb->st_gid = k->gid;
7253e0a4 609 sb->st_size = 0;
2c51f8dd 610 free_key(k);
adc3867b
SH
611 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
612 ret = -ENOENT;
613 goto out;
614 }
615 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
616 ret = -EACCES;
617 goto out;
618 }
2c51f8dd
SH
619
620 ret = 0;
758ad80c
SH
621 }
622
2c51f8dd
SH
623out:
624 free(cgdir);
625 return ret;
758ad80c 626}
2183082c 627
758ad80c 628static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 629{
7f163b71 630 struct fuse_context *fc = fuse_get_context();
7f163b71 631 const char *cgroup;
c688e1b3 632 struct file_info *dir_info;
2c51f8dd 633 char *controller = NULL;
7f163b71
SH
634
635 if (!fc)
636 return -EIO;
637
c688e1b3
SH
638 if (strcmp(path, "/cgroup") == 0) {
639 cgroup = NULL;
640 controller = NULL;
641 } else {
642 // return list of keys for the controller, and list of child cgroups
643 controller = pick_controller_from_path(fc, path);
644 if (!controller)
645 return -EIO;
7f163b71 646
c688e1b3
SH
647 cgroup = find_cgroup_in_path(path);
648 if (!cgroup) {
649 /* this is just /cgroup/controller, return its contents */
650 cgroup = "/";
651 }
7f163b71
SH
652 }
653
a8b6c3e0
SH
654 if (cgroup) {
655 if (!caller_may_see_dir(fc->pid, controller, cgroup))
656 return -ENOENT;
657 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
658 return -EACCES;
2c51f8dd 659 }
c688e1b3
SH
660
661 /* we'll free this at cg_releasedir */
2c51f8dd
SH
662 dir_info = malloc(sizeof(*dir_info));
663 if (!dir_info)
664 return -ENOMEM;
35482f91
SH
665 dir_info->controller = must_copy_string(controller);
666 dir_info->cgroup = must_copy_string(cgroup);
443d13f5 667 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 668 dir_info->buf = NULL;
8f6e8f5e 669 dir_info->file = NULL;
c688e1b3
SH
670 dir_info->buflen = 0;
671
672 fi->fh = (unsigned long)dir_info;
758ad80c
SH
673 return 0;
674}
675
758ad80c
SH
676static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
677 struct fuse_file_info *fi)
678{
c688e1b3 679 struct file_info *d = (struct file_info *)fi->fh;
35482f91 680 struct cgfs_files **list = NULL;
2c51f8dd
SH
681 int i, ret;
682 char *nextcg = NULL;
758ad80c 683 struct fuse_context *fc = fuse_get_context();
2c51f8dd 684 char **clist = NULL;
758ad80c 685
443d13f5 686 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
687 fprintf(stderr, "Internal error: file cache info used in readdir\n");
688 return -EIO;
689 }
c688e1b3
SH
690 if (!d->cgroup && !d->controller) {
691 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
692 int i;
693
35482f91
SH
694 for (i = 0; i < num_hierarchies; i++) {
695 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
758ad80c
SH
696 return -EIO;
697 }
698 }
699 return 0;
700 }
701
35482f91 702 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
3db25a35 703 // not a valid cgroup
2c51f8dd
SH
704 ret = -EINVAL;
705 goto out;
706 }
3db25a35 707
c688e1b3 708 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
709 if (nextcg) {
710 int ret;
711 ret = filler(buf, nextcg, NULL, 0);
2c51f8dd
SH
712 free(nextcg);
713 if (ret != 0) {
714 ret = -EIO;
715 goto out;
716 }
3db25a35 717 }
2c51f8dd
SH
718 ret = 0;
719 goto out;
3db25a35
SH
720 }
721
758ad80c 722 for (i = 0; list[i]; i++) {
758ad80c 723 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2c51f8dd
SH
724 ret = -EIO;
725 goto out;
758ad80c
SH
726 }
727 }
728
729 // now get the list of child cgroups
758ad80c 730
35482f91 731 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2c51f8dd
SH
732 ret = 0;
733 goto out;
734 }
758ad80c 735 for (i = 0; clist[i]; i++) {
758ad80c 736 if (filler(buf, clist[i], NULL, 0) != 0) {
2c51f8dd
SH
737 ret = -EIO;
738 goto out;
758ad80c
SH
739 }
740 }
2c51f8dd
SH
741 ret = 0;
742
743out:
744 free_keys(list);
745 if (clist) {
746 for (i = 0; clist[i]; i++)
747 free(clist[i]);
748 free(clist);
749 }
750 return ret;
758ad80c
SH
751}
752
8f6e8f5e
SH
753static void do_release_file_info(struct file_info *f)
754{
2c51f8dd
SH
755 if (!f)
756 return;
757 free(f->controller);
758 free(f->cgroup);
759 free(f->file);
760 free(f->buf);
761 free(f);
8f6e8f5e
SH
762}
763
758ad80c
SH
764static int cg_releasedir(const char *path, struct fuse_file_info *fi)
765{
c688e1b3
SH
766 struct file_info *d = (struct file_info *)fi->fh;
767
8f6e8f5e 768 do_release_file_info(d);
758ad80c
SH
769 return 0;
770}
771
99978832
SH
772static int cg_open(const char *path, struct fuse_file_info *fi)
773{
99978832 774 const char *cgroup;
febf2b87 775 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
35482f91 776 struct cgfs_files *k = NULL;
8f6e8f5e 777 struct file_info *file_info;
99978832 778 struct fuse_context *fc = fuse_get_context();
2c51f8dd 779 int ret;
99978832
SH
780
781 if (!fc)
782 return -EIO;
783
784 controller = pick_controller_from_path(fc, path);
785 if (!controller)
786 return -EIO;
787 cgroup = find_cgroup_in_path(path);
788 if (!cgroup)
789 return -EINVAL;
790
febf2b87
SH
791 get_cgdir_and_path(cgroup, &cgdir, &last);
792 if (!last) {
99978832
SH
793 path1 = "/";
794 path2 = cgdir;
795 } else {
796 path1 = cgdir;
febf2b87 797 path2 = last;
99978832
SH
798 }
799
35482f91 800 k = cgfs_get_key(controller, path1, path2);
2c51f8dd
SH
801 if (!k) {
802 ret = -EINVAL;
803 goto out;
804 }
805 free_key(k);
99978832 806
a8b6c3e0
SH
807 if (!caller_may_see_dir(fc->pid, controller, path1)) {
808 ret = -ENOENT;
809 goto out;
810 }
2c51f8dd 811 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
8f6e8f5e 812 // should never get here
2c51f8dd
SH
813 ret = -EACCES;
814 goto out;
815 }
99978832 816
8f6e8f5e 817 /* we'll free this at cg_release */
2c51f8dd
SH
818 file_info = malloc(sizeof(*file_info));
819 if (!file_info) {
820 ret = -ENOMEM;
821 goto out;
822 }
35482f91
SH
823 file_info->controller = must_copy_string(controller);
824 file_info->cgroup = must_copy_string(path1);
825 file_info->file = must_copy_string(path2);
443d13f5 826 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
827 file_info->buf = NULL;
828 file_info->buflen = 0;
829
830 fi->fh = (unsigned long)file_info;
2c51f8dd
SH
831 ret = 0;
832
833out:
834 free(cgdir);
835 return ret;
8f6e8f5e
SH
836}
837
838static int cg_release(const char *path, struct fuse_file_info *fi)
839{
840 struct file_info *f = (struct file_info *)fi->fh;
841
842 do_release_file_info(f);
843 return 0;
99978832
SH
844}
845
5b2dfd85
SH
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable or is hung up.
 *
 * @timeout is in seconds, which is how the callers in this file use it;
 * it is converted to the milliseconds epoll_wait() expects.  A negative
 * @timeout waits without limit, 0 only polls.
 *
 * Returns true if the descriptor is ready; false on timeout or on any
 * epoll failure (which is also reported on stderr).
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int timeout_ms = timeout < 0 ? -1 : timeout * 1000;
	int epfd, ret;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	ret = epoll_wait(epfd, &ev, 1, timeout_ms);
	close(epfd);

	if (ret == 0)
		return false;
	if (ret < 0) {
		fprintf(stderr, "Failure during epoll_wait: %m\n");
		return false;
	}
	return true;
}
a05660a6 878
5b2dfd85
SH
/*
 * Receive up to @len bytes from @sockfd into @buf, after waiting for the
 * socket with wait_for_sock(sockfd, 2).
 * Returns the recv() result, or -1 if the wait reports the socket is not
 * ready.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	return ready ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
885
01e71852
SH
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/*
 * Send the single byte @v over @sock, with @cred attached as
 * SCM_CREDENTIALS ancillary data.
 *
 * If @pingfirst is set, a one-byte message from the peer is awaited (via
 * msgrecv) before sending.
 *
 * Returns SEND_CREDS_OK on success, SEND_CREDS_NOTSK if sendmsg() failed
 * with ESRCH (presumably because the pid named in @cred does not exist),
 * and SEND_CREDS_FAIL for any other failure.
 */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	buf[0] = 'p';

	if (pingfirst) {
		if (msgrecv(sock, buf, 1) != 1) {
			fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
				  __func__);
			return SEND_CREDS_FAIL;
		}
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDENTIALS;
	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));

	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	buf[0] = v;
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (sendmsg(sock, &msg, 0) < 0) {
		/* keep errno: the reporting calls below may overwrite it */
		int saved_errno = errno;

		fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
			  strerror(saved_errno));
		if (saved_errno == ESRCH)
			return SEND_CREDS_NOTSK;
		return SEND_CREDS_FAIL;
	}

	return SEND_CREDS_OK;
}
934
/*
 * Receive one byte plus SCM_CREDENTIALS from @sock.
 *
 * SO_PASSCRED is enabled on the socket and the byte '1' is written to it
 * before waiting for the message.
 *
 * On return *v holds the received byte ('1' when nothing was received)
 * and *cred the received credentials; the fields of *cred stay -1 when
 * the message carried no credentials of the expected form.
 *
 * Returns false if any step fails or the wait for the message times out.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1] = { '1' };
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = sizeof(buf),
	};
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cmsgbuf,
		.msg_controllen = sizeof(cmsgbuf),
	};
	struct cmsghdr *cmsg;
	int optval = 1;
	int ret;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* the sender waits for this byte before it sends */
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
994
995
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Runs until a message carrying the byte '1' arrives or receiving fails,
 * and then exits the process with status 0; exits with status 1 if a pid
 * cannot be written back.  Never returns.  @tpid is not used here.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	while (recv_creds(sock, &cred, &v) && v != '1') {
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1014
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The forked child acknowledges its start with one byte over a pipe and
 * then runs pid_to_ns().  This process waits for that byte and for the
 * child's exit; it exits with status 0 if the child ran and exited
 * successfully and with status 1 on any failure.  Never returns.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	cpid = fork();
	if (cpid < 0)
		_exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	/*
	 * Drop our copy of the write end so that a child dying before its
	 * ack shows up as a hangup on the read end.
	 */
	close(cpipe[1]);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/* wait_for_pid() returns 0 on success, -1 on failure */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1068
1069/*
1070 * To read cgroup files with a particular pid, we will setns into the child
1071 * pidns, open a pipe, fork a child - which will be the first to really be in
35482f91 1072 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
a05660a6
SH
1073 */
1074static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1075{
1076 int sock[2] = {-1, -1};
2c51f8dd 1077 char *tmpdata = NULL;
a05660a6
SH
1078 int ret;
1079 pid_t qpid, cpid = -1;
1080 bool answer = false;
1081 char v = '0';
1082 struct ucred cred;
2c51f8dd 1083 size_t sz = 0, asz = 0;
a05660a6 1084
35482f91 1085 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
a05660a6
SH
1086 return false;
1087
1088 /*
1089 * Now we read the pids from returned data one by one, pass
1090 * them into a child in the target namespace, read back the
1091 * translated pids, and put them into our to-return data
1092 */
1093
1094 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1095 perror("socketpair");
2c51f8dd
SH
1096 free(tmpdata);
1097 return false;
a05660a6
SH
1098 }
1099
1100 cpid = fork();
1101 if (cpid == -1)
1102 goto out;
1103
ff96a5f9 1104 if (!cpid) // child - exits when done
4775fba1 1105 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
1106
1107 char *ptr = tmpdata;
1108 cred.uid = 0;
1109 cred.gid = 0;
1110 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1111 cred.pid = qpid;
01e71852
SH
1112 ret = send_creds(sock[0], &cred, v, true);
1113
1114 if (ret == SEND_CREDS_NOTSK)
1115 goto next;
1116 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
1117 goto out;
1118
1119 // read converted results
5b2dfd85
SH
1120 if (!wait_for_sock(sock[0], 2)) {
1121 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
6ee867dc 1122 __func__, strerror(errno));
a05660a6
SH
1123 goto out;
1124 }
1125 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1126 fprintf(stderr, "%s: error reading pid from child: %s\n",
1127 __func__, strerror(errno));
a05660a6
SH
1128 goto out;
1129 }
2c51f8dd 1130 must_strcat_pid(d, &sz, &asz, qpid);
01e71852 1131next:
a05660a6
SH
1132 ptr = strchr(ptr, '\n');
1133 if (!ptr)
1134 break;
1135 ptr++;
1136 }
1137
1138 cred.pid = getpid();
1139 v = '1';
01e71852 1140 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1141 // failed to ask child to exit
6ee867dc
SH
1142 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1143 __func__, strerror(errno));
a05660a6
SH
1144 goto out;
1145 }
1146
1147 answer = true;
1148
1149out:
2c51f8dd 1150 free(tmpdata);
a05660a6
SH
1151 if (cpid != -1)
1152 wait_for_pid(cpid);
1153 if (sock[0] != -1) {
1154 close(sock[0]);
1155 close(sock[1]);
1156 }
1157 return answer;
1158}
1159
99978832
SH
1160static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1161 struct fuse_file_info *fi)
1162{
99978832 1163 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1164 struct file_info *f = (struct file_info *)fi->fh;
35482f91 1165 struct cgfs_files *k = NULL;
2c51f8dd
SH
1166 char *data = NULL;
1167 int ret, s;
1168 bool r;
99978832 1169
443d13f5 1170 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1171 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1172 return -EIO;
1173 }
1174
99978832 1175 if (offset)
7253e0a4 1176 return 0;
99978832
SH
1177
1178 if (!fc)
1179 return -EIO;
1180
8f6e8f5e 1181 if (!f->controller)
99978832
SH
1182 return -EINVAL;
1183
35482f91 1184 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2c51f8dd
SH
1185 return -EINVAL;
1186 }
1187 free_key(k);
99978832 1188
99978832 1189
2c51f8dd
SH
1190 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1191 ret = -EACCES;
1192 goto out;
1193 }
a05660a6 1194
2c51f8dd
SH
1195 if (strcmp(f->file, "tasks") == 0 ||
1196 strcmp(f->file, "/tasks") == 0 ||
1197 strcmp(f->file, "/cgroup.procs") == 0 ||
1198 strcmp(f->file, "cgroup.procs") == 0)
1199 // special case - we have to translate the pids
1200 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1201 else
35482f91 1202 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
99978832 1203
2c51f8dd
SH
1204 if (!r) {
1205 ret = -EINVAL;
1206 goto out;
1207 }
99978832 1208
2c51f8dd
SH
1209 if (!data) {
1210 ret = 0;
1211 goto out;
99978832 1212 }
2c51f8dd
SH
1213 s = strlen(data);
1214 if (s > size)
1215 s = size;
1216 memcpy(buf, data, s);
1217 if (s > 0 && s < size && data[s-1] != '\n')
1218 buf[s++] = '\n';
99978832 1219
2c51f8dd
SH
1220 ret = s;
1221
1222out:
1223 free(data);
1224 return ret;
99978832
SH
1225}
1226
4775fba1
SH
1227static void pid_from_ns(int sock, pid_t tpid)
1228{
1229 pid_t vpid;
1230 struct ucred cred;
1231 char v;
6ee867dc 1232 int ret;
4775fba1
SH
1233
1234 cred.uid = 0;
1235 cred.gid = 0;
6ee867dc 1236 while (1) {
5b2dfd85
SH
1237 if (!wait_for_sock(sock, 2)) {
1238 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
67bd113f 1239 _exit(1);
6ee867dc
SH
1240 }
1241 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1242 fprintf(stderr, "%s: bad read from parent: %s\n",
1243 __func__, strerror(errno));
67bd113f 1244 _exit(1);
6ee867dc 1245 }
4775fba1 1246 if (vpid == -1) // done
01e71852 1247 break;
4775fba1
SH
1248 v = '0';
1249 cred.pid = vpid;
01e71852 1250 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1251 v = '1';
1252 cred.pid = getpid();
01e71852 1253 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
67bd113f 1254 _exit(1);
4775fba1
SH
1255 }
1256 }
67bd113f 1257 _exit(0);
4775fba1
SH
1258}
1259
/*
 * pid_from_ns_wrapper: enter @tpid's pid namespace and run pid_from_ns()
 * in a freshly forked worker; never returns.
 *
 * setns() on a pid namespace only affects children created afterwards,
 * hence the extra fork.  The worker acknowledges that it is running by
 * writing '1' to a pipe; if no valid ack arrives within 1 second the worker
 * is killed, reaped and a new one is forked.
 *
 * Exits 0 when the worker terminates successfully (per wait_for_pid()),
 * 1 on any setup failure or when the worker fails.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int len, nsfd, ackpipe[2];

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		char ack;
		pid_t worker = fork();

		if (worker < 0)
			_exit(1);

		if (worker == 0) {
			char b = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			/* pid_from_ns() always ends in _exit() */
			pid_from_ns(sock, tpid);
		}

		// give the worker 1 second to finish forking and
		// write its ack
		if (wait_for_sock(ackpipe[0], 1) &&
		    read(ackpipe[0], &ack, 1) == sizeof(char) &&
		    ack == '1') {
			if (!wait_for_pid(worker))
				_exit(1);
			_exit(0);
		}

		/* no (valid) ack: get rid of this worker and try again */
		kill(worker, SIGKILL);
		wait_for_pid(worker);
	}
}
1315
8ee2a503
SH
/*
 * hostuid_to_ns: translate host @uid into @pid's user namespace.
 *
 * Reads /proc/@pid/uid_map and stores the converted id in *@answer.
 * Returns true when a mapping exists.  Returns false when the map cannot
 * be opened (*@answer untouched) or when convert_id_to_ns() yields -1
 * (*@answer is then left holding that -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char path[400];
	FILE *map;

	sprintf(path, "/proc/%d/uid_map", pid);
	map = fopen(path, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
1337
/*
 * get_pid_creds: look up the uid and gid of @pid.
 *
 * Parses the first number on the "Uid:" and "Gid:" lines of
 * /proc/@pid/status.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and keep that value when the status
 * file cannot be opened or the corresponding line is missing.  Parsing
 * stops at the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &parsed_uid) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
			continue;
		}
		if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &parsed_gid) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
1376
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r's uid maps to root in @r's user namespace and @v's uid is
 *     mapped into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* the requestor has to be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be visible in that namespace */
	return hostuid_to_ns(v_uid, r, &mapped);
}
1402
1403static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
1404 const char *file, const char *buf)
4775fba1
SH
1405{
1406 int sock[2] = {-1, -1};
1407 pid_t qpid, cpid = -1;
35482f91 1408 FILE *pids_file = NULL;
4775fba1
SH
1409 bool answer = false, fail = false;
1410
35482f91
SH
1411 pids_file = open_pids_file(contrl, cg);
1412 if (!pids_file)
1413 return false;
1414
4775fba1
SH
1415 /*
1416 * write the pids to a socket, have helper in writer's pidns
1417 * call movepid for us
1418 */
1419 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1420 perror("socketpair");
35482f91 1421 goto out;
4775fba1
SH
1422 }
1423
1424 cpid = fork();
1425 if (cpid == -1)
1426 goto out;
1427
35482f91
SH
1428 if (!cpid) { // child
1429 fclose(pids_file);
4775fba1 1430 pid_from_ns_wrapper(sock[1], tpid);
35482f91 1431 }
4775fba1
SH
1432
1433 const char *ptr = buf;
1434 while (sscanf(ptr, "%d", &qpid) == 1) {
1435 struct ucred cred;
1436 char v;
1437
1438 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1439 fprintf(stderr, "%s: error writing pid to child: %s\n",
1440 __func__, strerror(errno));
4775fba1
SH
1441 goto out;
1442 }
1443
01e71852
SH
1444 if (recv_creds(sock[0], &cred, &v)) {
1445 if (v == '0') {
8ee2a503
SH
1446 if (!may_move_pid(tpid, tuid, cred.pid)) {
1447 fail = true;
1448 break;
1449 }
35482f91 1450 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
01e71852
SH
1451 fail = true;
1452 }
4775fba1
SH
1453 }
1454
1455 ptr = strchr(ptr, '\n');
1456 if (!ptr)
1457 break;
1458 ptr++;
1459 }
1460
1461 /* All good, write the value */
1462 qpid = -1;
1463 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1464 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1465
1466 if (!fail)
1467 answer = true;
1468
1469out:
1470 if (cpid != -1)
1471 wait_for_pid(cpid);
1472 if (sock[0] != -1) {
1473 close(sock[0]);
1474 close(sock[1]);
1475 }
35482f91
SH
1476 if (pids_file) {
1477 if (fclose(pids_file) != 0)
1478 answer = false;
1479 }
4775fba1
SH
1480 return answer;
1481}
1482
2ad6d2bd
SH
1483int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1484 struct fuse_file_info *fi)
1485{
2ad6d2bd 1486 struct fuse_context *fc = fuse_get_context();
2c51f8dd 1487 char *localbuf = NULL;
35482f91 1488 struct cgfs_files *k = NULL;
8f6e8f5e 1489 struct file_info *f = (struct file_info *)fi->fh;
2c51f8dd 1490 bool r;
2ad6d2bd 1491
443d13f5 1492 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1493 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1494 return -EIO;
1495 }
1496
2ad6d2bd 1497 if (offset)
7253e0a4 1498 return 0;
2ad6d2bd
SH
1499
1500 if (!fc)
1501 return -EIO;
1502
2c51f8dd 1503 localbuf = alloca(size+1);
47cbf0e5
SH
1504 localbuf[size] = '\0';
1505 memcpy(localbuf, buf, size);
2ad6d2bd 1506
35482f91 1507 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2c51f8dd
SH
1508 size = -EINVAL;
1509 goto out;
1510 }
2ad6d2bd 1511
2c51f8dd
SH
1512 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1513 size = -EACCES;
1514 goto out;
1515 }
4775fba1 1516
2c51f8dd
SH
1517 if (strcmp(f->file, "tasks") == 0 ||
1518 strcmp(f->file, "/tasks") == 0 ||
1519 strcmp(f->file, "/cgroup.procs") == 0 ||
1520 strcmp(f->file, "cgroup.procs") == 0)
1521 // special case - we have to translate the pids
8ee2a503 1522 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2c51f8dd 1523 else
35482f91 1524 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2ad6d2bd 1525
2c51f8dd
SH
1526 if (!r)
1527 size = -EINVAL;
2ad6d2bd 1528
2c51f8dd
SH
1529out:
1530 free_key(k);
1531 return size;
2ad6d2bd
SH
1532}
1533
341b21ad
SH
1534int cg_chown(const char *path, uid_t uid, gid_t gid)
1535{
1536 struct fuse_context *fc = fuse_get_context();
febf2b87 1537 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
35482f91 1538 struct cgfs_files *k = NULL;
341b21ad 1539 const char *cgroup;
2c51f8dd 1540 int ret;
341b21ad
SH
1541
1542 if (!fc)
1543 return -EIO;
1544
1545 if (strcmp(path, "/cgroup") == 0)
1546 return -EINVAL;
1547
1548 controller = pick_controller_from_path(fc, path);
1549 if (!controller)
f9a05025 1550 return -EINVAL;
341b21ad
SH
1551 cgroup = find_cgroup_in_path(path);
1552 if (!cgroup)
1553 /* this is just /cgroup/controller */
1554 return -EINVAL;
1555
febf2b87 1556 get_cgdir_and_path(cgroup, &cgdir, &last);
341b21ad 1557
febf2b87 1558 if (!last) {
341b21ad
SH
1559 path1 = "/";
1560 path2 = cgdir;
1561 } else {
1562 path1 = cgdir;
febf2b87 1563 path2 = last;
341b21ad
SH
1564 }
1565
1566 if (is_child_cgroup(controller, path1, path2)) {
1567 // get uid, gid, from '/tasks' file and make up a mode
1568 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
35482f91 1569 k = cgfs_get_key(controller, cgroup, "tasks");
341b21ad
SH
1570
1571 } else
35482f91 1572 k = cgfs_get_key(controller, path1, path2);
341b21ad 1573
2c51f8dd
SH
1574 if (!k) {
1575 ret = -EINVAL;
1576 goto out;
1577 }
341b21ad
SH
1578
1579 /*
1580 * This being a fuse request, the uid and gid must be valid
1581 * in the caller's namespace. So we can just check to make
1582 * sure that the caller is root in his uid, and privileged
1583 * over the file's current owner.
1584 */
2c51f8dd
SH
1585 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1586 ret = -EACCES;
1587 goto out;
1588 }
341b21ad 1589
1f69d62e 1590 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2c51f8dd
SH
1591
1592out:
1593 free_key(k);
1594 free(cgdir);
1595
1596 return ret;
341b21ad 1597}
2ad6d2bd 1598
fd2e4e03
SH
1599int cg_chmod(const char *path, mode_t mode)
1600{
0a1bb5ea 1601 struct fuse_context *fc = fuse_get_context();
febf2b87 1602 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
35482f91 1603 struct cgfs_files *k = NULL;
0a1bb5ea 1604 const char *cgroup;
2c51f8dd 1605 int ret;
0a1bb5ea
SH
1606
1607 if (!fc)
1608 return -EIO;
1609
1610 if (strcmp(path, "/cgroup") == 0)
1611 return -EINVAL;
1612
1613 controller = pick_controller_from_path(fc, path);
1614 if (!controller)
f9a05025 1615 return -EINVAL;
0a1bb5ea
SH
1616 cgroup = find_cgroup_in_path(path);
1617 if (!cgroup)
1618 /* this is just /cgroup/controller */
1619 return -EINVAL;
1620
febf2b87 1621 get_cgdir_and_path(cgroup, &cgdir, &last);
0a1bb5ea 1622
febf2b87 1623 if (!last) {
0a1bb5ea
SH
1624 path1 = "/";
1625 path2 = cgdir;
1626 } else {
1627 path1 = cgdir;
febf2b87 1628 path2 = last;
0a1bb5ea
SH
1629 }
1630
1631 if (is_child_cgroup(controller, path1, path2)) {
1632 // get uid, gid, from '/tasks' file and make up a mode
1633 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
35482f91 1634 k = cgfs_get_key(controller, cgroup, "tasks");
0a1bb5ea
SH
1635
1636 } else
35482f91 1637 k = cgfs_get_key(controller, path1, path2);
0a1bb5ea 1638
2c51f8dd
SH
1639 if (!k) {
1640 ret = -EINVAL;
1641 goto out;
1642 }
0a1bb5ea
SH
1643
1644 /*
1645 * This being a fuse request, the uid and gid must be valid
1646 * in the caller's namespace. So we can just check to make
1647 * sure that the caller is root in his uid, and privileged
1648 * over the file's current owner.
1649 */
2c51f8dd
SH
1650 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1651 ret = -EPERM;
1652 goto out;
1653 }
0a1bb5ea 1654
35482f91 1655 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2c51f8dd
SH
1656 ret = -EINVAL;
1657 goto out;
1658 }
1659
1660 ret = 0;
1661out:
1662 free_key(k);
1663 free(cgdir);
1664 return ret;
fd2e4e03
SH
1665}
1666
ab54b798
SH
1667int cg_mkdir(const char *path, mode_t mode)
1668{
1669 struct fuse_context *fc = fuse_get_context();
febf2b87 1670 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
ab54b798 1671 const char *cgroup;
2c51f8dd 1672 int ret;
ab54b798 1673
ab54b798
SH
1674 if (!fc)
1675 return -EIO;
1676
1677
1678 controller = pick_controller_from_path(fc, path);
1679 if (!controller)
f9a05025 1680 return -EINVAL;
ab54b798
SH
1681
1682 cgroup = find_cgroup_in_path(path);
1683 if (!cgroup)
f9a05025 1684 return -EINVAL;
ab54b798 1685
febf2b87
SH
1686 get_cgdir_and_path(cgroup, &cgdir, &last);
1687 if (!last)
ab54b798
SH
1688 path1 = "/";
1689 else
1690 path1 = cgdir;
1691
a8b6c3e0 1692 if (!caller_is_in_ancestor(fc->pid, controller, path1, &next)) {
febf2b87 1693 if (last && strcmp(next, last) == 0)
a8b6c3e0
SH
1694 ret = -EEXIST;
1695 else
1696 ret = -ENOENT;
1697 goto out;
1698 }
1699
2c51f8dd
SH
1700 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1701 ret = -EACCES;
1702 goto out;
1703 }
1704 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
1705 ret = -EACCES;
1706 goto out;
1707 }
ab54b798 1708
af869b9c 1709 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
ab54b798 1710
2c51f8dd
SH
1711out:
1712 free(cgdir);
a8b6c3e0 1713 free(next);
2c51f8dd 1714 return ret;
ab54b798
SH
1715}
1716
50d8d5b5
SH
1717static int cg_rmdir(const char *path)
1718{
1719 struct fuse_context *fc = fuse_get_context();
febf2b87 1720 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
50d8d5b5 1721 const char *cgroup;
2c51f8dd 1722 int ret;
50d8d5b5
SH
1723
1724 if (!fc)
1725 return -EIO;
1726
50d8d5b5
SH
1727 controller = pick_controller_from_path(fc, path);
1728 if (!controller)
f9a05025 1729 return -EINVAL;
50d8d5b5
SH
1730
1731 cgroup = find_cgroup_in_path(path);
1732 if (!cgroup)
f9a05025 1733 return -EINVAL;
50d8d5b5 1734
febf2b87
SH
1735 get_cgdir_and_path(cgroup, &cgdir, &last);
1736 if (!last) {
2c51f8dd
SH
1737 ret = -EINVAL;
1738 goto out;
1739 }
50d8d5b5 1740
a8b6c3e0 1741 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &next)) {
febf2b87 1742 if (!last || strcmp(next, last) == 0)
a8b6c3e0
SH
1743 ret = -EBUSY;
1744 else
1745 ret = -ENOENT;
1746 goto out;
1747 }
1748
2c51f8dd
SH
1749 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1750 ret = -EACCES;
1751 goto out;
1752 }
1753 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
1754 ret = -EACCES;
1755 goto out;
1756 }
50d8d5b5 1757
35482f91 1758 if (!cgfs_remove(controller, cgroup)) {
2c51f8dd
SH
1759 ret = -EINVAL;
1760 goto out;
1761 }
50d8d5b5 1762
2c51f8dd
SH
1763 ret = 0;
1764
1765out:
1766 free(cgdir);
a8b6c3e0 1767 free(next);
2c51f8dd 1768 return ret;
50d8d5b5
SH
1769}
1770
2dc17609
SH
/* Return true iff @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1777
/*
 * get_mem_cached: extract the page-cache size from memory.stat contents.
 *
 * Scans @memstat line by line for the first line starting with
 * "total_cache", parses the number following it and stores that number
 * divided by 1024 in *@v.  *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1795
/*
 * get_blkio_io_value: look up one counter in a blkio statistics dump.
 *
 * Builds the key "<major>:<minor> <iotype>" (truncated to 31 characters),
 * finds the first line of @str starting with it and parses the unsigned
 * number that follows into *@v.  *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1818
53b43826
SH
1819static int read_file(const char *path, char *buf, size_t size,
1820 struct file_info *d)
1821{
1822 size_t linelen = 0, total_len = 0, rv = 0;
1823 char *line = NULL;
1824 char *cache = d->buf;
1825 size_t cache_size = d->buflen;
1826 FILE *f = fopen(path, "r");
1827 if (!f)
1828 return 0;
1829
1830 while (getline(&line, &linelen, f) != -1) {
1831 size_t l = snprintf(cache, cache_size, "%s", line);
1832 if (l < 0) {
1833 perror("Error writing to cache");
1834 rv = 0;
1835 goto err;
1836 }
1837 if (l >= cache_size) {
1838 fprintf(stderr, "Internal error: truncated write to cache\n");
1839 rv = 0;
1840 goto err;
1841 }
1842 if (l < cache_size) {
1843 cache += l;
1844 cache_size -= l;
1845 total_len += l;
1846 } else {
1847 cache += cache_size;
1848 total_len += cache_size;
1849 cache_size = 0;
1850 break;
1851 }
1852 }
1853
1854 d->size = total_len;
1855 if (total_len > size ) total_len = size;
1856
1857 /* read from off 0 */
1858 memcpy(buf, d->buf, total_len);
1859 rv = total_len;
1860 err:
1861 fclose(f);
1862 free(line);
1863 return rv;
1864}
1865
758ad80c 1866/*
2ad6d2bd 1867 * FUSE ops for /proc
758ad80c 1868 */
758ad80c 1869
7bc95a75
SH
/*
 * get_memlimit: read memory.limit_in_bytes of @cgroup in the "memory"
 * controller.  Returns the value parsed as a base-10 unsigned long, or
 * (unsigned long)-1 when it cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
1882
/*
 * get_min_memlimit: smallest memory limit that applies to @cgroup.
 *
 * Starts with the limit of @cgroup itself and then walks up with dirname()
 * until "/" has been visited, keeping the lowest readable limit found.
 * Ancestors whose limit cannot be read are ignored.
 * NOTE(review): the walk only ends at "/", so @cgroup is presumably always
 * an absolute cgroup path - confirm against the callers.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk);

	while (strcmp(walk, "/") != 0) {
		unsigned long cur;

		walk = dirname(walk);
		cur = get_memlimit(walk);
		if (cur != -1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
1899
23ce2127
SH
1900static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1901 struct fuse_file_info *fi)
1902{
2dc17609 1903 struct fuse_context *fc = fuse_get_context();
97f1f27b 1904 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd 1905 char *cg;
4622ad78 1906 char *memusage_str = NULL, *memstat_str = NULL,
b731895e
NW
1907 *memswlimit_str = NULL, *memswusage_str = NULL,
1908 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
4622ad78
TG
1909 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
1910 cached = 0, hosttotal = 0;
2dc17609 1911 char *line = NULL;
e1068397 1912 size_t linelen = 0, total_len = 0, rv = 0;
97f1f27b
YY
1913 char *cache = d->buf;
1914 size_t cache_size = d->buflen;
2c51f8dd 1915 FILE *f = NULL;
2dc17609 1916
97f1f27b
YY
1917 if (offset){
1918 if (offset > d->size)
1919 return -EINVAL;
b5ad2d21
SH
1920 if (!d->cached)
1921 return 0;
97f1f27b
YY
1922 int left = d->size - offset;
1923 total_len = left > size ? size: left;
1924 memcpy(buf, cache + offset, total_len);
1925 return total_len;
1926 }
2dc17609 1927
2c51f8dd 1928 cg = get_pid_cgroup(fc->pid, "memory");
2dc17609 1929 if (!cg)
53b43826 1930 return read_file("/proc/meminfo", buf, size, d);
2dc17609 1931
7bc95a75 1932 memlimit = get_min_memlimit(cg);
35482f91 1933 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2c51f8dd 1934 goto err;
35482f91 1935 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2c51f8dd 1936 goto err;
4622ad78
TG
1937
1938 // Following values are allowed to fail, because swapaccount might be turned
1939 // off for current kernel
1940 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
1941 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
1942 {
b731895e
NW
1943 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
1944 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
1945 goto err;
1946 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
1947 goto err;
1948
4622ad78
TG
1949 memswlimit = strtoul(memswlimit_str, NULL, 10);
1950 memswusage = strtoul(memswusage_str, NULL, 10);
b731895e
NW
1951
1952 if (!strcmp(memswlimit_str, memswlimit_default_str))
a2de34ba 1953 memswlimit = 0;
b731895e 1954 if (!strcmp(memswusage_str, memswusage_default_str))
a2de34ba
SH
1955 memswusage = 0;
1956
b731895e
NW
1957 memswlimit = memswlimit / 1024;
1958 memswusage = memswusage / 1024;
4622ad78 1959 }
b731895e
NW
1960
1961 memusage = strtoul(memusage_str, NULL, 10);
1962 memlimit /= 1024;
1963 memusage /= 1024;
1964
2dc17609
SH
1965 get_mem_cached(memstat_str, &cached);
1966
1967 f = fopen("/proc/meminfo", "r");
1968 if (!f)
2c51f8dd 1969 goto err;
2dc17609
SH
1970
1971 while (getline(&line, &linelen, f) != -1) {
1972 size_t l;
1973 char *printme, lbuf[100];
1974
1975 memset(lbuf, 0, 100);
1976 if (startswith(line, "MemTotal:")) {
1977 sscanf(line+14, "%lu", &hosttotal);
1978 if (hosttotal < memlimit)
1979 memlimit = hosttotal;
1980 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1981 printme = lbuf;
1982 } else if (startswith(line, "MemFree:")) {
1983 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1984 printme = lbuf;
1985 } else if (startswith(line, "MemAvailable:")) {
1986 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1987 printme = lbuf;
4622ad78
TG
1988 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
1989 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
1990 printme = lbuf;
1991 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
1992 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
1993 (memswlimit - memlimit) - (memswusage - memusage));
1994 printme = lbuf;
2dc17609
SH
1995 } else if (startswith(line, "Buffers:")) {
1996 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1997 printme = lbuf;
1998 } else if (startswith(line, "Cached:")) {
1999 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2000 printme = lbuf;
2001 } else if (startswith(line, "SwapCached:")) {
2002 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2003 printme = lbuf;
2004 } else
2005 printme = line;
97f1f27b
YY
2006
2007 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
2008 if (l < 0) {
2009 perror("Error writing to cache");
2010 rv = 0;
2011 goto err;
2012
2013 }
2014 if (l >= cache_size) {
2015 fprintf(stderr, "Internal error: truncated write to cache\n");
2016 rv = 0;
2017 goto err;
2018 }
2019
97f1f27b
YY
2020 cache += l;
2021 cache_size -= l;
2f919d9d 2022 total_len += l;
2dc17609
SH
2023 }
2024
b5ad2d21 2025 d->cached = 1;
97f1f27b
YY
2026 d->size = total_len;
2027 if (total_len > size ) total_len = size;
2028 memcpy(buf, d->buf, total_len);
2029
e1068397 2030 rv = total_len;
2c51f8dd
SH
2031err:
2032 if (f)
2033 fclose(f);
92c84dc4 2034 free(line);
2c51f8dd 2035 free(cg);
2c51f8dd 2036 free(memusage_str);
4622ad78
TG
2037 free(memswlimit_str);
2038 free(memswusage_str);
2c51f8dd 2039 free(memstat_str);
b731895e
NW
2040 free(memswlimit_default_str);
2041 free(memswusage_default_str);
e1068397 2042 return rv;
23ce2127
SH
2043}
2044
/*
 * Fetch cpuset.cpus of cgroup @cg in the "cpuset" controller.
 * Returns a newly allocated string which the caller must free, or NULL
 * when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus = NULL;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2057
fa47bb52 2058bool cpu_in_cpuset(int cpu, const char *cpuset);
23ce2127 2059
aeb56147
SH
/*
 * Return true iff @line is a "processor : N" line from /proc/cpuinfo and
 * cpu N is contained in @cpuset (as decided by cpu_in_cpuset()).
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
2068
23ce2127
SH
/*
 * Return true iff @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. it parses as "processor", a colon and an integer
 * with optional whitespace in between.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
2080
23ce2127
SH
2081static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2082 struct fuse_file_info *fi)
2083{
2084 struct fuse_context *fc = fuse_get_context();
97f1f27b 2085 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2086 char *cg;
2087 char *cpuset = NULL;
23ce2127 2088 char *line = NULL;
e1068397 2089 size_t linelen = 0, total_len = 0, rv = 0;
23ce2127
SH
2090 bool am_printing = false;
2091 int curcpu = -1;
97f1f27b
YY
2092 char *cache = d->buf;
2093 size_t cache_size = d->buflen;
2c51f8dd 2094 FILE *f = NULL;
23ce2127 2095
97f1f27b
YY
2096 if (offset){
2097 if (offset > d->size)
2098 return -EINVAL;
b5ad2d21
SH
2099 if (!d->cached)
2100 return 0;
97f1f27b
YY
2101 int left = d->size - offset;
2102 total_len = left > size ? size: left;
2103 memcpy(buf, cache + offset, total_len);
2f919d9d 2104 return total_len;
97f1f27b 2105 }
23ce2127 2106
2c51f8dd 2107 cg = get_pid_cgroup(fc->pid, "cpuset");
23ce2127 2108 if (!cg)
53b43826 2109 return read_file("proc/cpuinfo", buf, size, d);
23ce2127
SH
2110
2111 cpuset = get_cpuset(cg);
2112 if (!cpuset)
2c51f8dd 2113 goto err;
23ce2127
SH
2114
2115 f = fopen("/proc/cpuinfo", "r");
2116 if (!f)
2c51f8dd 2117 goto err;
23ce2127
SH
2118
2119 while (getline(&line, &linelen, f) != -1) {
2120 size_t l;
2121 if (is_processor_line(line)) {
aeb56147 2122 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
2123 if (am_printing) {
2124 curcpu ++;
97f1f27b 2125 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
e1068397
MM
2126 if (l < 0) {
2127 perror("Error writing to cache");
2128 rv = 0;
2129 goto err;
2130 }
2131 if (l >= cache_size) {
2132 fprintf(stderr, "Internal error: truncated write to cache\n");
2133 rv = 0;
2134 goto err;
2135 }
97f1f27b
YY
2136 if (l < cache_size){
2137 cache += l;
2138 cache_size -= l;
2139 total_len += l;
2140 }else{
2141 cache += cache_size;
2142 total_len += cache_size;
2143 cache_size = 0;
2144 break;
2145 }
23ce2127
SH
2146 }
2147 continue;
2148 }
2149 if (am_printing) {
97f1f27b 2150 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2151 if (l < 0) {
2152 perror("Error writing to cache");
2153 rv = 0;
2154 goto err;
2155 }
2156 if (l >= cache_size) {
2157 fprintf(stderr, "Internal error: truncated write to cache\n");
2158 rv = 0;
2159 goto err;
2160 }
97f1f27b
YY
2161 if (l < cache_size) {
2162 cache += l;
2163 cache_size -= l;
2164 total_len += l;
2165 } else {
2166 cache += cache_size;
2167 total_len += cache_size;
2168 cache_size = 0;
2169 break;
2170 }
23ce2127
SH
2171 }
2172 }
2173
b5ad2d21 2174 d->cached = 1;
97f1f27b
YY
2175 d->size = total_len;
2176 if (total_len > size ) total_len = size;
2177
2178 /* read from off 0 */
2179 memcpy(buf, d->buf, total_len);
e1068397 2180 rv = total_len;
2c51f8dd
SH
2181err:
2182 if (f)
2183 fclose(f);
92c84dc4 2184 free(line);
2c51f8dd
SH
2185 free(cpuset);
2186 free(cg);
e1068397 2187 return rv;
23ce2127
SH
2188}
2189
2190static int proc_stat_read(char *buf, size_t size, off_t offset,
2191 struct fuse_file_info *fi)
2192{
aeb56147 2193 struct fuse_context *fc = fuse_get_context();
97f1f27b 2194 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2195 char *cg;
2196 char *cpuset = NULL;
aeb56147 2197 char *line = NULL;
e1068397 2198 size_t linelen = 0, total_len = 0, rv = 0;
2a0fde62 2199 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
2200 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2201 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2202 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2203#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2204 char cpuall[CPUALL_MAX_SIZE];
2205 /* reserve for cpu all */
2206 char *cache = d->buf + CPUALL_MAX_SIZE;
2207 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2c51f8dd 2208 FILE *f = NULL;
aeb56147 2209
97f1f27b
YY
2210 if (offset){
2211 if (offset > d->size)
2212 return -EINVAL;
b5ad2d21
SH
2213 if (!d->cached)
2214 return 0;
97f1f27b
YY
2215 int left = d->size - offset;
2216 total_len = left > size ? size: left;
2217 memcpy(buf, d->buf + offset, total_len);
2f919d9d 2218 return total_len;
97f1f27b 2219 }
aeb56147 2220
2c51f8dd 2221 cg = get_pid_cgroup(fc->pid, "cpuset");
aeb56147 2222 if (!cg)
53b43826 2223 return read_file("/proc/stat", buf, size, d);
aeb56147
SH
2224
2225 cpuset = get_cpuset(cg);
2226 if (!cpuset)
2c51f8dd 2227 goto err;
aeb56147
SH
2228
2229 f = fopen("/proc/stat", "r");
2230 if (!f)
2c51f8dd 2231 goto err;
aeb56147 2232
97f1f27b
YY
2233 //skip first line
2234 if (getline(&line, &linelen, f) < 0) {
2235 fprintf(stderr, "proc_stat_read read first line failed\n");
2c51f8dd 2236 goto err;
97f1f27b
YY
2237 }
2238
aeb56147
SH
2239 while (getline(&line, &linelen, f) != -1) {
2240 size_t l;
2241 int cpu;
2a0fde62 2242 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
2243 char *c;
2244
2a0fde62
CB
2245 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2246 /* not a ^cpuN line containing a number N, just print it */
97f1f27b 2247 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2248 if (l < 0) {
2249 perror("Error writing to cache");
2250 rv = 0;
2251 goto err;
2252 }
2253 if (l >= cache_size) {
2254 fprintf(stderr, "Internal error: truncated write to cache\n");
2255 rv = 0;
2256 goto err;
2257 }
2258 if (l < cache_size) {
97f1f27b
YY
2259 cache += l;
2260 cache_size -= l;
2261 total_len += l;
2262 continue;
e1068397 2263 } else {
97f1f27b
YY
2264 //no more space, break it
2265 cache += cache_size;
2266 total_len += cache_size;
2267 cache_size = 0;
2268 break;
2269 }
aeb56147 2270 }
2a0fde62
CB
2271
2272 if (sscanf(cpu_char, "%d", &cpu) != 1)
2273 continue;
aeb56147
SH
2274 if (!cpu_in_cpuset(cpu, cpuset))
2275 continue;
2276 curcpu ++;
2277
2278 c = strchr(line, ' ');
2279 if (!c)
2280 continue;
25c5e8fb 2281 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
e1068397
MM
2282 if (l < 0) {
2283 perror("Error writing to cache");
2284 rv = 0;
2285 goto err;
2286
2287 }
2288 if (l >= cache_size) {
2289 fprintf(stderr, "Internal error: truncated write to cache\n");
2290 rv = 0;
2291 goto err;
2292 }
2293
97f1f27b
YY
2294 cache += l;
2295 cache_size -= l;
aeb56147 2296 total_len += l;
2f919d9d 2297
97f1f27b
YY
2298 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2299 &softirq, &steal, &guest) != 9)
2300 continue;
2301 user_sum += user;
2302 nice_sum += nice;
2303 system_sum += system;
2304 idle_sum += idle;
2305 iowait_sum += iowait;
2306 irq_sum += irq;
2307 softirq_sum += softirq;
2308 steal_sum += steal;
2f919d9d 2309 guest_sum += guest;
97f1f27b
YY
2310 }
2311
2312 cache = d->buf;
2313
2f919d9d 2314 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
2315 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2316 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2317 memcpy(cache, cpuall, cpuall_len);
2f919d9d 2318 cache += cpuall_len;
2c51f8dd 2319 } else{
97f1f27b
YY
2320 /* shouldn't happen */
2321 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2322 cpuall_len = 0;
2323 }
2324
2325 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2326 total_len += cpuall_len;
b5ad2d21 2327 d->cached = 1;
97f1f27b
YY
2328 d->size = total_len;
2329 if (total_len > size ) total_len = size;
2330
2331 memcpy(buf, d->buf, total_len);
e1068397 2332 rv = total_len;
2c51f8dd
SH
2333
2334err:
2335 if (f)
2336 fclose(f);
92c84dc4 2337 free(line);
2c51f8dd
SH
2338 free(cpuset);
2339 free(cg);
e1068397 2340 return rv;
23ce2127
SH
2341}
2342
/*
 * Age, in seconds, of the reaper of @pid's pid namespace.
 *
 * The reaper is located with get_init_pid_for_task() and its age is taken
 * as "now minus the ctime of /proc/<reaper>".  Returns 0 when the reaper
 * cannot be found, the path does not fit, or the lstat() fails.
 */
static long int getreaperage(pid_t pid)
{
	char procpath[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = get_init_pid_for_task(pid);
	if (reaper < 0)
		return 0;

	n = snprintf(procpath, 100, "/proc/%d", reaper);
	if (n < 0 || n >= 100)
		return 0;

	if (lstat(procpath, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
2363
0b6af11b
SH
/*
 * Helper run in a forked process; it never returns to its caller (every
 * path ends in _exit()).
 *
 * It joins @target's pid namespace and forks once more: per setns(2), a
 * pid namespace change only applies to children created afterwards.  The
 * grandchild sends the byte '1' over @sock together with credentials
 * claiming pid 1, so that the receiver can read the task's reaper's pid
 * in its own namespace.  The intermediate process just reaps the
 * grandchild and exits 0.
 *
 * Exits with status 1 if the namespace file cannot be named, opened or
 * entered, or if the second fork fails.
 */
void write_task_init_pid_exit(int sock, pid_t target)
{
	struct ucred cred;
	char nspath[100];
	pid_t child;
	int nsfd, n;

	n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", (int)target);
	if (n < 0 || n >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}

	if (setns(nsfd, 0) != 0) {
		perror("write_task_init_pid_exit setns 1");
		close(nsfd);
		_exit(1);
	}

	child = fork();
	if (child < 0)
		_exit(1);

	if (child > 0) {
		/* intermediate process: reap the sender, then go away */
		wait_for_pid(child);
		_exit(0);
	}

	/* grandchild, now running inside the target pid namespace */
	cred.pid = 1;
	cred.uid = 0;
	cred.gid = 0;
	send_creds(sock, &cred, '1', true);
	_exit(0);
}
2407
/*
 * Return the pid, as seen from our own pid namespace, of the reaper of
 * @task's pid namespace, or -1 on failure.
 *
 * A child is forked to run write_task_init_pid_exit(), which sends
 * credentials over a unix datagram socketpair; the pid field of the
 * credentials we receive is the answer.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0) {
		perror("fork");
		goto out;
	}
	if (!pid) {
		close(sock[1]);
		/* does not return: every path in the helper ends in _exit() */
		write_task_init_pid_exit(sock[0], task);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	/*
	 * Only reap a child that was actually created.  After a failed
	 * fork() pid is -1, and waiting on -1 means "any child" to waitpid().
	 */
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
2439
/*
 * CPU time consumed by the cpuacct cgroup of @task's reaper, in whole
 * seconds.
 *
 * The value of cpuacct.usage is parsed as an unsigned decimal and divided
 * by 10^9 (the kernel documents that file as being in nanoseconds).
 * Returns 0 if the reaper, its cgroup, or the value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long secs = 0;
	pid_t reaper;

	reaper = get_init_pid_for_task(task);
	if (reaper == -1)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
		secs = strtoul(raw, NULL, 10) / 1000000000;

	free(cg);
	free(raw);
	return secs;
}
2462
2463/*
2464 * We read /proc/uptime and reuse its second field.
2465 * For the first field, we use the mtime for the reaper for
2466 * the calling pid as returned by getreaperage
2467 */
23ce2127
SH
2468static int proc_uptime_read(char *buf, size_t size, off_t offset,
2469 struct fuse_file_info *fi)
2470{
41bb9357 2471 struct fuse_context *fc = fuse_get_context();
97f1f27b 2472 struct file_info *d = (struct file_info *)fi->fh;
ff96a5f9 2473 long int reaperage = getreaperage(fc->pid);
0b6af11b 2474 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
b5ad2d21 2475 char *cache = d->buf;
97f1f27b 2476 size_t total_len = 0;
41bb9357 2477
97f1f27b
YY
2478 if (offset){
2479 if (offset > d->size)
2480 return -EINVAL;
b5ad2d21
SH
2481 if (!d->cached)
2482 return 0;
2483 int left = d->size - offset;
2484 total_len = left > size ? size: left;
2485 memcpy(buf, cache + offset, total_len);
2486 return total_len;
97f1f27b
YY
2487 }
2488
0b6af11b 2489 idletime = reaperage - busytime;
f6c0b279
SH
2490 if (idletime > reaperage)
2491 idletime = reaperage;
2492
b5ad2d21 2493 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
e1068397
MM
2494 if (total_len < 0){
2495 perror("Error writing to cache");
2496 return 0;
2497 }
cdcdb29b 2498
b5ad2d21
SH
2499 d->size = (int)total_len;
2500 d->cached = 1;
2501
2502 if (total_len > size) total_len = size;
2503
2504 memcpy(buf, d->buf, total_len);
97f1f27b 2505 return total_len;
23ce2127
SH
2506}
2507
49878439
YY
2508static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2509 struct fuse_file_info *fi)
2510{
2511 char dev_name[72];
2512 struct fuse_context *fc = fuse_get_context();
97f1f27b 2513 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2514 char *cg;
2515 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
49878439
YY
2516 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2517 unsigned long read = 0, write = 0;
2518 unsigned long read_merged = 0, write_merged = 0;
2519 unsigned long read_sectors = 0, write_sectors = 0;
2520 unsigned long read_ticks = 0, write_ticks = 0;
2521 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2522 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
b5ad2d21
SH
2523 char *cache = d->buf;
2524 size_t cache_size = d->buflen;
49878439 2525 char *line = NULL;
e1068397 2526 size_t linelen = 0, total_len = 0, rv = 0;
49878439
YY
2527 unsigned int major = 0, minor = 0;
2528 int i = 0;
2c51f8dd 2529 FILE *f = NULL;
49878439 2530
97f1f27b
YY
2531 if (offset){
2532 if (offset > d->size)
2533 return -EINVAL;
b5ad2d21
SH
2534 if (!d->cached)
2535 return 0;
2536 int left = d->size - offset;
2537 total_len = left > size ? size: left;
2538 memcpy(buf, cache + offset, total_len);
2539 return total_len;
97f1f27b 2540 }
49878439 2541
2c51f8dd 2542 cg = get_pid_cgroup(fc->pid, "blkio");
49878439 2543 if (!cg)
53b43826 2544 return read_file("/proc/diskstats", buf, size, d);
49878439 2545
35482f91 2546 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2c51f8dd 2547 goto err;
35482f91 2548 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2c51f8dd 2549 goto err;
35482f91 2550 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2c51f8dd 2551 goto err;
35482f91 2552 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2c51f8dd 2553 goto err;
35482f91 2554 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2c51f8dd 2555 goto err;
49878439
YY
2556
2557
2558 f = fopen("/proc/diskstats", "r");
2559 if (!f)
2c51f8dd 2560 goto err;
49878439
YY
2561
2562 while (getline(&line, &linelen, f) != -1) {
2563 size_t l;
2564 char *printme, lbuf[256];
2565
c0adec85 2566 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
49878439
YY
2567 if(i == 3){
2568 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2569 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2570 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2571 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2572 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2573 read_sectors = read_sectors/512;
2574 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2575 write_sectors = write_sectors/512;
2f919d9d 2576
49878439
YY
2577 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2578 rd_svctm = rd_svctm/1000000;
2579 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2580 rd_wait = rd_wait/1000000;
2581 read_ticks = rd_svctm + rd_wait;
2582
2583 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2584 wr_svctm = wr_svctm/1000000;
2585 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2586 wr_wait = wr_wait/1000000;
2587 write_ticks = wr_svctm + wr_wait;
2588
2589 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2590 tot_ticks = tot_ticks/1000000;
2591 }else{
2592 continue;
2593 }
2594
2595 memset(lbuf, 0, 256);
2596 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2597 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2598 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2599 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2600 printme = lbuf;
2601 } else
2602 continue;
2603
b5ad2d21 2604 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
2605 if (l < 0) {
2606 perror("Error writing to fuse buf");
2607 rv = 0;
2608 goto err;
2609 }
b5ad2d21 2610 if (l >= cache_size) {
e1068397
MM
2611 fprintf(stderr, "Internal error: truncated write to cache\n");
2612 rv = 0;
2613 goto err;
2614 }
b5ad2d21
SH
2615 cache += l;
2616 cache_size -= l;
49878439
YY
2617 total_len += l;
2618 }
2619
b5ad2d21 2620 d->cached = 1;
97f1f27b 2621 d->size = total_len;
b5ad2d21
SH
2622 if (total_len > size ) total_len = size;
2623 memcpy(buf, d->buf, total_len);
2624
e1068397 2625 rv = total_len;
2c51f8dd
SH
2626err:
2627 free(cg);
2628 if (f)
2629 fclose(f);
49878439 2630 free(line);
2c51f8dd
SH
2631 free(io_serviced_str);
2632 free(io_merged_str);
2633 free(io_service_bytes_str);
2634 free(io_wait_time_str);
2635 free(io_service_time_str);
e1068397 2636 return rv;
49878439
YY
2637}
2638
23ce2127
SH
/*
 * Return the number of bytes obtained by reading the file @which line by
 * line until getline() fails, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;
	FILE *f;

	f = fopen(which, "r");
	if (f == NULL)
		return 0;

	for (;;) {
		n = getline(&line, &cap, f);
		if (n == -1)
			break;
		total += n;
	}

	fclose(f);
	free(line);

	return total;
}
2655
758ad80c
SH
/*
 * getattr for the /proc subtree.
 *
 * @sb is zeroed and stamped with the current wall-clock time, owner 0:0.
 * "/proc" is reported as a 0555 directory with two links; the five
 * virtualised files are reported as 0444 regular files of size 0.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, -ENOENT for
 * any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2683
2684static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2685 struct fuse_file_info *fi)
2686{
2687 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2688 filler(buf, "meminfo", NULL, 0) != 0 ||
2689 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2690 filler(buf, "uptime", NULL, 0) != 0 ||
2691 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2692 return -EINVAL;
758ad80c
SH
2693 return 0;
2694}
2695
35629743
SH
2696static int proc_open(const char *path, struct fuse_file_info *fi)
2697{
96fc5ee6
SH
2698 int type = -1;
2699 struct file_info *info;
2700
2701 if (strcmp(path, "/proc/meminfo") == 0)
2702 type = LXC_TYPE_PROC_MEMINFO;
2703 else if (strcmp(path, "/proc/cpuinfo") == 0)
2704 type = LXC_TYPE_PROC_CPUINFO;
2705 else if (strcmp(path, "/proc/uptime") == 0)
2706 type = LXC_TYPE_PROC_UPTIME;
2707 else if (strcmp(path, "/proc/stat") == 0)
2708 type = LXC_TYPE_PROC_STAT;
2709 else if (strcmp(path, "/proc/diskstats") == 0)
2710 type = LXC_TYPE_PROC_DISKSTATS;
2711 if (type == -1)
2712 return -ENOENT;
2713
2c51f8dd
SH
2714 info = malloc(sizeof(*info));
2715 if (!info)
2716 return -ENOMEM;
2717
96fc5ee6
SH
2718 memset(info, 0, sizeof(*info));
2719 info->type = type;
2720
97f1f27b 2721 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2c51f8dd
SH
2722 do {
2723 info->buf = malloc(info->buflen);
2724 } while (!info->buf);
97f1f27b
YY
2725 memset(info->buf, 0, info->buflen);
2726 /* set actual size to buffer size */
2f919d9d 2727 info->size = info->buflen;
97f1f27b 2728
96fc5ee6
SH
2729 fi->fh = (unsigned long)info;
2730 return 0;
2731}
2732
2733static int proc_release(const char *path, struct fuse_file_info *fi)
2734{
2735 struct file_info *f = (struct file_info *)fi->fh;
2736
2737 do_release_file_info(f);
2738 return 0;
35629743
SH
2739}
2740
35629743
SH
2741static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2742 struct fuse_file_info *fi)
2743{
96fc5ee6
SH
2744 struct file_info *f = (struct file_info *) fi->fh;
2745
2746 switch (f->type) {
2f919d9d 2747 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2748 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2749 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2750 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2751 case LXC_TYPE_PROC_UPTIME:
23ce2127 2752 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2753 case LXC_TYPE_PROC_STAT:
23ce2127 2754 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2755 case LXC_TYPE_PROC_DISKSTATS:
49878439 2756 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2757 default:
2758 return -EINVAL;
2759 }
35629743
SH
2760}
2761
2ad6d2bd
SH
2762/*
2763 * FUSE ops for /
2764 * these just delegate to the /proc and /cgroup ops as
2765 * needed
2766 */
758ad80c
SH
2767
2768static int lxcfs_getattr(const char *path, struct stat *sb)
2769{
2770 if (strcmp(path, "/") == 0) {
2771 sb->st_mode = S_IFDIR | 00755;
2772 sb->st_nlink = 2;
2773 return 0;
2774 }
2775 if (strncmp(path, "/cgroup", 7) == 0) {
2776 return cg_getattr(path, sb);
2777 }
35629743 2778 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2779 return proc_getattr(path, sb);
2780 }
2781 return -EINVAL;
2782}
2783
/*
 * opendir for the whole mount.  "/" and "/proc" need no state and succeed
 * immediately; "/cgroup..." is delegated to cg_opendir(); every other
 * path is -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -ENOENT;
}
2796
2797static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2798 struct fuse_file_info *fi)
2799{
2800 if (strcmp(path, "/") == 0) {
2801 if (filler(buf, "proc", NULL, 0) != 0 ||
2802 filler(buf, "cgroup", NULL, 0) != 0)
2803 return -EINVAL;
2804 return 0;
2805 }
35629743 2806 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2807 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2808 if (strcmp(path, "/proc") == 0)
2809 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2810 return -EINVAL;
2811}
2812
/*
 * releasedir for the whole mount.  "/" and "/proc" hold no state and
 * return 0; "/cgroup..." is delegated to cg_releasedir(); every other
 * path is -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -EINVAL;
}
2824
99978832
SH
/*
 * open for the whole mount: delegate by path prefix ("/cgroup" first,
 * then "/proc"); any other path is -EINVAL.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_open(path, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_open(path, fi);
}
2834
/*
 * read for the whole mount: delegate by path prefix ("/cgroup" first,
 * then "/proc"); any other path is -EINVAL.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_read(path, buf, size, offset, fi);
}
2845
2ad6d2bd
SH
/*
 * write for the whole mount: only paths starting with "/cgroup" are
 * writable (delegated to cg_write()); everything else is -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2855
99978832
SH
/* flush: nothing is buffered at this level, so always report success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;
	return 0;
}
2860
/*
 * release for the whole mount: delegate by path prefix ("/cgroup" first,
 * then "/proc"); any other path is -EINVAL.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_release(path, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_release(path, fi);
}
2870
/* fsync: there is nothing to synchronise, so always report success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;
	return 0;
}
2875
ab54b798
SH
/*
 * mkdir for the whole mount: only allowed below "/cgroup" (delegated to
 * cg_mkdir()); everything else is -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2883
341b21ad
SH
/*
 * chown for the whole mount: only allowed below "/cgroup" (delegated to
 * cg_chown()); everything else is -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2891
2ad6d2bd
SH
/*
 * cat truncates a file before it calls ops->write.  Truncation has no
 * sensible meaning for cgroup files, so for any path starting with
 * "/cgroup" we report success without doing anything.  All other paths
 * are rejected with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;
	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2903
50d8d5b5
SH
/*
 * rmdir for the whole mount: only allowed below "/cgroup" (delegated to
 * cg_rmdir()); everything else is -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2910
fd2e4e03
SH
/*
 * chmod for the whole mount: only allowed below "/cgroup" (delegated to
 * cg_chmod()); everything else is -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2917
758ad80c
SH
2918const struct fuse_operations lxcfs_ops = {
2919 .getattr = lxcfs_getattr,
2920 .readlink = NULL,
2921 .getdir = NULL,
2922 .mknod = NULL,
ab54b798 2923 .mkdir = lxcfs_mkdir,
758ad80c 2924 .unlink = NULL,
50d8d5b5 2925 .rmdir = lxcfs_rmdir,
758ad80c
SH
2926 .symlink = NULL,
2927 .rename = NULL,
2928 .link = NULL,
fd2e4e03 2929 .chmod = lxcfs_chmod,
341b21ad 2930 .chown = lxcfs_chown,
2ad6d2bd 2931 .truncate = lxcfs_truncate,
758ad80c 2932 .utime = NULL,
99978832
SH
2933
2934 .open = lxcfs_open,
2935 .read = lxcfs_read,
2936 .release = lxcfs_release,
2ad6d2bd 2937 .write = lxcfs_write,
99978832 2938
758ad80c 2939 .statfs = NULL,
99978832
SH
2940 .flush = lxcfs_flush,
2941 .fsync = lxcfs_fsync,
758ad80c
SH
2942
2943 .setxattr = NULL,
2944 .getxattr = NULL,
2945 .listxattr = NULL,
2946 .removexattr = NULL,
2947
2948 .opendir = lxcfs_opendir,
2949 .readdir = lxcfs_readdir,
2950 .releasedir = lxcfs_releasedir,
2951
2952 .fsyncdir = NULL,
2953 .init = NULL,
2954 .destroy = NULL,
2955 .access = NULL,
2956 .create = NULL,
2957 .ftruncate = NULL,
2958 .fgetattr = NULL,
2959};
2960
/*
 * Print the command-line synopsis for program name @me to stderr and
 * terminate the process with exit status 1.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
2969
/*
 * Return true if @w is one of the accepted spellings of the help flag:
 * "-h", "--help", "-help" or "help".
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h", "--help", "-help", "help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}
	return false;
}
2979
0b0f73db
SH
/*
 * Remove the first occurrence of @which from the NULL-terminated @argv
 * (argv[0] is never considered), shifting the following entries and the
 * terminating NULL down by one and decrementing *@argcp.  Does nothing
 * if @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap; the loop ends once the NULL has been copied down */
	for (; *slot; slot++)
		*slot = slot[1];

	(*argcp)--;
}
2994
/*
 * Remove the first "@opt @v" pair from the NULL-terminated @argv
 * (argv[0] is never considered), shifting the following entries and the
 * terminating NULL down by two and decrementing *@argcp by two.
 *
 * If @opt is found but is followed by a value other than @v, the
 * offending value is reported on stderr and the process exits with
 * status 1.  An @opt that is the last argument is left alone.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* name the value we actually found, not the one we wanted */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+1] non-NULL guarantees argv[i+2] is readable */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3015
758ad80c
SH
3016int main(int argc, char *argv[])
3017{
c0adec85 3018 int ret = -1;
0b0f73db
SH
3019 /*
3020 * what we pass to fuse_main is:
3021 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3022 */
2c51f8dd
SH
3023 int nargs = 5, cnt = 0;
3024 char *newargv[6];
758ad80c 3025
977ac879 3026#ifdef FORTRAVIS
df062bcb
SH
3027 /* for travis which runs on 12.04 */
3028 if (glib_check_version (2, 36, 0) != NULL)
3029 g_type_init ();
977ac879 3030#endif
df062bcb 3031
0b0f73db
SH
3032 /* accomodate older init scripts */
3033 swallow_arg(&argc, argv, "-s");
3034 swallow_arg(&argc, argv, "-f");
3035 swallow_option(&argc, argv, "-o", "allow_other");
3036
2e9c0b32
SH
3037 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3038 fprintf(stderr, "%s\n", VERSION);
3039 exit(0);
3040 }
0b0f73db 3041 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
3042 usage(argv[0]);
3043
38a76a91 3044 newargv[cnt++] = argv[0];
38a76a91
SH
3045 newargv[cnt++] = "-f";
3046 newargv[cnt++] = "-o";
f466a31e 3047 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
38a76a91
SH
3048 newargv[cnt++] = argv[1];
3049 newargv[cnt++] = NULL;
758ad80c 3050
35482f91 3051 if (!cgfs_setup_controllers())
c0adec85 3052 goto out;
758ad80c 3053
35482f91 3054 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
758ad80c 3055
c0adec85 3056out:
758ad80c 3057 return ret;
2183082c 3058}