]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
epoll: update timeout and retry at eintr
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
2c51f8dd 3 * Copyright © 2014,2015 Canonical, Inc
758ad80c
SH
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357 25#include <sys/mount.h>
5b2dfd85 26#include <sys/epoll.h>
41bb9357 27#include <wait.h>
758ad80c 28
977ac879 29#ifdef FORTRAVIS
df062bcb
SH
30#define GLIB_DISABLE_DEPRECATION_WARNINGS
31#include <glib-object.h>
977ac879 32#endif
df062bcb 33
35482f91 34#include "cgfs.h"
2e9c0b32 35#include "config.h" // for VERSION
758ad80c 36
443d13f5
SH
/*
 * Kind of object a struct file_info handle describes; stored in
 * file_info.type and checked by the read/readdir handlers.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
46
c688e1b3
SH
/*
 * Per-open-handle state, allocated in the open/opendir handlers, stashed in
 * fuse_file_info.fh and freed by do_release_file_info().
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	// one of the LXC_TYPE_* values
	char *buf;	// unused as of yet
	int buflen;
	int size;	// actual data size
	int cached;
};
57
97f1f27b
YY
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing the buffer when needed.
 * src: a pointer to a char* in which to append the pid (*src may be NULL).
 * sz: the number of characters printed so far, minus trailing \0; updated.
 * asz: the allocated size so far; bumped by BUF_RESERVE_SIZE on growth.
 * pid: the pid to append
 *
 * This function does not return on allocation failure: realloc() is
 * retried until it succeeds.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char pidstr[30];
	int pidlen = snprintf(pidstr, sizeof(pidstr), "%d\n", (int)pid);

	if (!*src || pidlen + *sz + 1 >= *asz) {
		char *grown;

		do {
			grown = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!grown);
		*src = grown;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy(*src + *sz, pidstr, pidlen);
	*sz += pidlen;
	(*src)[*sz] = '\0';
}
86
0afd85bd
SH
87static pid_t get_init_pid_for_task(pid_t task);
88
a05660a6
SH
/*
 * Reap child @pid, restarting waitpid() when it is interrupted by a signal.
 *
 * Returns 0 only if the child exited normally with status 0.  Returns -1
 * if @pid <= 0, if waitpid() fails, or if the child terminated abnormally
 * or with a non-zero exit status.
 */
static int wait_for_pid(pid_t pid)
{
	int status, ret;

	if (pid <= 0)
		return -1;

again:
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		if (errno == EINTR)
			goto again;
		return -1;
	}
	if (ret != pid)
		goto again;
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;
	return 0;
}
109
053a659d
SH
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 *
 * The stream is rewound first, so the same FILE * can be queried several
 * times.  Lines that do not parse as three unsigned numbers
 * (ns base, host base, count) are skipped.
 *
 * Returns the mapped id, or -1 (i.e. (unsigned int)-1) if no range
 * contains in_id or if a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			fprintf(stderr, "id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less than nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
153
341b21ad
SH
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, is
 * privileged over @victim.
 *
 * Returns true when
 *  - req_ns_root is false and uid == victim, or
 *  - uid maps to 0 in /proc/<pid>/uid_map and victim has some mapping
 *    in that same map.
 * Returns false otherwise, including when either id is -1 or the map
 * cannot be opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;
	FILE *f;

	if (victim == (uid_t)-1 || uid == (uid_t)-1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000)
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == (uid_t)-1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}
209
/*
 * Check whether the "other" permission bits (lowest three bits) of @fmode
 * grant the access requested by the open(2) access mode in @req_mode.
 * Callers shift the owner or group bits down into the "other" position
 * before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY,
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t r;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		r = S_IROTH;
		break;
	case O_WRONLY:
		r = S_IWOTH;
		break;
	case O_RDWR:
		r = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}
	return ((fmode & r) == r);
}
229
a8b6c3e0
SH
230
/*
 * taskcg is /a/b/c/d/e
 * querycg is /a/b/c
 * we return 'd'
 *
 * i.e. return the first path component of taskcg that lies below querycg.
 * querycg "/" stands for the root: then the first component of taskcg
 * (after its leading '/') is returned.
 *
 * taskcg must be strictly longer than querycg, otherwise the input is
 * rejected.  Returns a newly allocated string which the caller must free,
 * or NULL on bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *start, *end;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	if (strcmp(querycg, "/") == 0)
		start = strdup(taskcg + 1);
	else
		start = strdup(taskcg + strlen(querycg) + 1);
	if (!start)
		return NULL;
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}
256
2c51f8dd
SH
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t l = strlen(x);

	if (l && x[l - 1] == '\n')
		x[l - 1] = '\0';
}
263
264static char *get_pid_cgroup(pid_t pid, const char *contrl)
265{
266 char fnam[PROCLEN];
267 FILE *f;
268 char *answer = NULL;
269 char *line = NULL;
270 size_t len = 0;
271 int ret;
777dd831
SH
272 const char *h = find_mounted_controller(contrl);
273 if (!h)
274 return NULL;
2c51f8dd
SH
275
276 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
277 if (ret < 0 || ret >= PROCLEN)
278 return NULL;
279 if (!(f = fopen(fnam, "r")))
280 return NULL;
281
282 while (getline(&line, &len, f) != -1) {
283 char *c1, *c2;
284 if (!line[0])
285 continue;
286 c1 = strchr(line, ':');
287 if (!c1)
288 goto out;
289 c1++;
290 c2 = strchr(c1, ':');
291 if (!c2)
292 goto out;
293 *c2 = '\0';
777dd831 294 if (strcmp(c1, h) != 0)
2c51f8dd
SH
295 continue;
296 c2++;
297 stripnewline(c2);
298 do {
299 answer = strdup(c2);
300 } while (!answer);
301 break;
302 }
303
304out:
305 fclose(f);
306 free(line);
307 return answer;
308}
309
758ad80c
SH
310/*
311 * check whether a fuse context may access a cgroup dir or file
312 *
313 * If file is not null, it is a cgroup file to check under cg.
314 * If file is null, then we are checking perms on cg itself.
315 *
316 * For files we can check the mode of the list_keys result.
317 * For cgroups, we must make assumptions based on the files under the
318 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
319 * yet.
320 */
321static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
322{
35482f91 323 struct cgfs_files *k = NULL;
2c51f8dd 324 bool ret = false;
758ad80c 325
35482f91
SH
326 k = cgfs_get_key(contrl, cg, file);
327 if (!k)
758ad80c 328 return false;
35482f91
SH
329
330 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
331 if (perms_include(k->mode >> 6, mode)) {
332 ret = true;
2c51f8dd 333 goto out;
758ad80c
SH
334 }
335 }
35482f91
SH
336 if (fc->gid == k->gid) {
337 if (perms_include(k->mode >> 3, mode)) {
338 ret = true;
339 goto out;
340 }
341 }
342 ret = perms_include(k->mode, mode);
758ad80c 343
2c51f8dd 344out:
35482f91 345 free_key(k);
2c51f8dd 346 return ret;
3db25a35
SH
347}
348
04b5cbdc
SH
#define INITSCOPE "/init.scope"
/*
 * If @cg ends in "/init.scope", cut that suffix off in place.  A cgroup
 * that is exactly "/init.scope" becomes "/".  Strings shorter than the
 * suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths rather than pointers: computing a pointer in
	 * front of cg would be undefined behaviour.
	 */
	if (cg_len < scope_len)
		return;
	point = cg + cg_len - scope_len;
	if (strcmp(point, INITSCOPE) == 0) {
		if (point == cg)
			*(point + 1) = '\0';
		else
			*point = '\0';
	}
}
363
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller (it is NULL if that directory could not be
 * determined).  *nextcg is only written on that path.
 *
 * The test is a plain prefix comparison of pid's cgroup (with a trailing
 * "/init.scope" pruned) against cg.  Returns false if pid's cgroup cannot
 * be read.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' for root cgroup, otherwise they pass
	 * in a cgroup without leading '/'
	 */
	linecmp = *cg == '/' ? c2 : c2 + 1;
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg)
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}
398
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is "/" for the root (always visible) or a cgroup path without a
 * leading '/'.  A directory is visible when it is the task's own cgroup,
 * one of its ancestors, or one of its descendants.  Returns false if the
 * task's cgroup cannot be read.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* skip the leading '/' so task_cg has the same form as cg */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
449
758ad80c 450/*
2c51f8dd
SH
451 * given /cgroup/freezer/a/b, return "freezer".
452 * the returned char* should NOT be freed.
758ad80c
SH
453 */
454static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
455{
456 const char *p1;
2c51f8dd 457 char *contr, *slash;
758ad80c
SH
458
459 if (strlen(path) < 9)
460 return NULL;
ac5d9d48
SH
461 if (*(path+7) != '/')
462 return NULL;
758ad80c 463 p1 = path+8;
2c51f8dd
SH
464 contr = strdupa(p1);
465 if (!contr)
466 return NULL;
467 slash = strstr(contr, "/");
758ad80c
SH
468 if (slash)
469 *slash = '\0';
470
758ad80c 471 int i;
35482f91
SH
472 for (i = 0; i < num_hierarchies; i++) {
473 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
474 return hierarchies[i];
758ad80c 475 }
758ad80c
SH
476 return NULL;
477}
478
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the '/' that follows the
 * controller name, or NULL if @path is shorter than 9 characters or has
 * nothing after the controller.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *p1;

	if (strlen(path) < 9)
		return NULL;
	p1 = strstr(path + 8, "/");
	if (!p1)
		return NULL;
	return p1 + 1;
}
494
/*
 * split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last not
 *
 * *last points into @cg at the final '/', so it keeps the leading slash
 * ("a/b/tasks" gives dir "a/b" and last "/tasks").  If @cg contains no
 * '/', *dir is a full copy of @cg and *last is NULL.
 * The copy is retried until strdup() succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *p;

	do {
		*dir = strdup(cg);
	} while (!*dir);
	*last = strrchr(cg, '/');
	if (!*last) {
		*last = NULL;
		return;
	}
	p = strrchr(*dir, '/');
	*p = '\0';
}
514
515/*
2ad6d2bd 516 * FUSE ops for /cgroup
758ad80c 517 */
2ad6d2bd 518
758ad80c
SH
519static int cg_getattr(const char *path, struct stat *sb)
520{
521 struct timespec now;
522 struct fuse_context *fc = fuse_get_context();
2c51f8dd 523 char * cgdir = NULL;
febf2b87 524 char *last = NULL, *path1, *path2;
35482f91 525 struct cgfs_files *k = NULL;
758ad80c 526 const char *cgroup;
2c51f8dd
SH
527 const char *controller = NULL;
528 int ret = -ENOENT;
758ad80c
SH
529
530
531 if (!fc)
532 return -EIO;
533
534 memset(sb, 0, sizeof(struct stat));
535
536 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
537 return -EINVAL;
538
539 sb->st_uid = sb->st_gid = 0;
540 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
541 sb->st_size = 0;
542
543 if (strcmp(path, "/cgroup") == 0) {
544 sb->st_mode = S_IFDIR | 00755;
545 sb->st_nlink = 2;
546 return 0;
547 }
548
549 controller = pick_controller_from_path(fc, path);
550 if (!controller)
551 return -EIO;
758ad80c
SH
552 cgroup = find_cgroup_in_path(path);
553 if (!cgroup) {
554 /* this is just /cgroup/controller, return it as a dir */
555 sb->st_mode = S_IFDIR | 00755;
556 sb->st_nlink = 2;
557 return 0;
558 }
341b21ad 559
febf2b87 560 get_cgdir_and_path(cgroup, &cgdir, &last);
758ad80c 561
febf2b87 562 if (!last) {
758ad80c
SH
563 path1 = "/";
564 path2 = cgdir;
565 } else {
566 path1 = cgdir;
febf2b87 567 path2 = last;
758ad80c
SH
568 }
569
0dcc31ea 570 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
571 if (initpid <= 0)
572 initpid = fc->pid;
758ad80c 573 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
febf2b87
SH
574 * Then check that caller's cgroup is under path if last is a child
575 * cgroup, or cgdir if last is a file */
758ad80c
SH
576
577 if (is_child_cgroup(controller, path1, path2)) {
0dcc31ea 578 if (!caller_may_see_dir(initpid, controller, cgroup)) {
a8b6c3e0
SH
579 ret = -ENOENT;
580 goto out;
581 }
0dcc31ea 582 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
f9a05025
SH
583 /* this is just /cgroup/controller, return it as a dir */
584 sb->st_mode = S_IFDIR | 00555;
585 sb->st_nlink = 2;
2c51f8dd
SH
586 ret = 0;
587 goto out;
588 }
589 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
590 ret = -EACCES;
591 goto out;
f9a05025 592 }
758ad80c 593
053a659d
SH
594 // get uid, gid, from '/tasks' file and make up a mode
595 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
596 sb->st_mode = S_IFDIR | 00755;
febf2b87 597 k = cgfs_get_key(controller, cgroup, NULL);
053a659d 598 if (!k) {
053a659d
SH
599 sb->st_uid = sb->st_gid = 0;
600 } else {
053a659d
SH
601 sb->st_uid = k->uid;
602 sb->st_gid = k->gid;
603 }
2c51f8dd 604 free_key(k);
758ad80c 605 sb->st_nlink = 2;
2c51f8dd
SH
606 ret = 0;
607 goto out;
758ad80c
SH
608 }
609
35482f91 610 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
758ad80c 611 sb->st_mode = S_IFREG | k->mode;
053a659d 612 sb->st_nlink = 1;
758ad80c
SH
613 sb->st_uid = k->uid;
614 sb->st_gid = k->gid;
7253e0a4 615 sb->st_size = 0;
2c51f8dd 616 free_key(k);
0dcc31ea 617 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
adc3867b
SH
618 ret = -ENOENT;
619 goto out;
620 }
621 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
622 ret = -EACCES;
623 goto out;
624 }
2c51f8dd
SH
625
626 ret = 0;
758ad80c
SH
627 }
628
2c51f8dd
SH
629out:
630 free(cgdir);
631 return ret;
758ad80c 632}
2183082c 633
758ad80c 634static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 635{
7f163b71 636 struct fuse_context *fc = fuse_get_context();
7f163b71 637 const char *cgroup;
c688e1b3 638 struct file_info *dir_info;
2c51f8dd 639 char *controller = NULL;
7f163b71
SH
640
641 if (!fc)
642 return -EIO;
643
c688e1b3
SH
644 if (strcmp(path, "/cgroup") == 0) {
645 cgroup = NULL;
646 controller = NULL;
647 } else {
648 // return list of keys for the controller, and list of child cgroups
649 controller = pick_controller_from_path(fc, path);
650 if (!controller)
651 return -EIO;
7f163b71 652
c688e1b3
SH
653 cgroup = find_cgroup_in_path(path);
654 if (!cgroup) {
655 /* this is just /cgroup/controller, return its contents */
656 cgroup = "/";
657 }
7f163b71
SH
658 }
659
0dcc31ea 660 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
661 if (initpid <= 0)
662 initpid = fc->pid;
a8b6c3e0 663 if (cgroup) {
0dcc31ea 664 if (!caller_may_see_dir(initpid, controller, cgroup))
a8b6c3e0
SH
665 return -ENOENT;
666 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
667 return -EACCES;
2c51f8dd 668 }
c688e1b3
SH
669
670 /* we'll free this at cg_releasedir */
2c51f8dd
SH
671 dir_info = malloc(sizeof(*dir_info));
672 if (!dir_info)
673 return -ENOMEM;
35482f91
SH
674 dir_info->controller = must_copy_string(controller);
675 dir_info->cgroup = must_copy_string(cgroup);
443d13f5 676 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 677 dir_info->buf = NULL;
8f6e8f5e 678 dir_info->file = NULL;
c688e1b3
SH
679 dir_info->buflen = 0;
680
681 fi->fh = (unsigned long)dir_info;
758ad80c
SH
682 return 0;
683}
684
758ad80c
SH
685static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
686 struct fuse_file_info *fi)
687{
c688e1b3 688 struct file_info *d = (struct file_info *)fi->fh;
35482f91 689 struct cgfs_files **list = NULL;
2c51f8dd
SH
690 int i, ret;
691 char *nextcg = NULL;
758ad80c 692 struct fuse_context *fc = fuse_get_context();
2c51f8dd 693 char **clist = NULL;
758ad80c 694
443d13f5 695 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
696 fprintf(stderr, "Internal error: file cache info used in readdir\n");
697 return -EIO;
698 }
c688e1b3
SH
699 if (!d->cgroup && !d->controller) {
700 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
701 int i;
702
35482f91
SH
703 for (i = 0; i < num_hierarchies; i++) {
704 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
758ad80c
SH
705 return -EIO;
706 }
707 }
708 return 0;
709 }
710
35482f91 711 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
3db25a35 712 // not a valid cgroup
2c51f8dd
SH
713 ret = -EINVAL;
714 goto out;
715 }
3db25a35 716
0dcc31ea 717 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
718 if (initpid <= 0)
719 initpid = fc->pid;
0dcc31ea 720 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
721 if (nextcg) {
722 int ret;
723 ret = filler(buf, nextcg, NULL, 0);
2c51f8dd
SH
724 free(nextcg);
725 if (ret != 0) {
726 ret = -EIO;
727 goto out;
728 }
3db25a35 729 }
2c51f8dd
SH
730 ret = 0;
731 goto out;
3db25a35
SH
732 }
733
758ad80c 734 for (i = 0; list[i]; i++) {
758ad80c 735 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2c51f8dd
SH
736 ret = -EIO;
737 goto out;
758ad80c
SH
738 }
739 }
740
741 // now get the list of child cgroups
758ad80c 742
35482f91 743 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2c51f8dd
SH
744 ret = 0;
745 goto out;
746 }
758ad80c 747 for (i = 0; clist[i]; i++) {
758ad80c 748 if (filler(buf, clist[i], NULL, 0) != 0) {
2c51f8dd
SH
749 ret = -EIO;
750 goto out;
758ad80c
SH
751 }
752 }
2c51f8dd
SH
753 ret = 0;
754
755out:
756 free_keys(list);
757 if (clist) {
758 for (i = 0; clist[i]; i++)
759 free(clist[i]);
760 free(clist);
761 }
762 return ret;
758ad80c
SH
763}
764
8f6e8f5e
SH
765static void do_release_file_info(struct file_info *f)
766{
2c51f8dd
SH
767 if (!f)
768 return;
769 free(f->controller);
770 free(f->cgroup);
771 free(f->file);
772 free(f->buf);
773 free(f);
8f6e8f5e
SH
774}
775
758ad80c
SH
776static int cg_releasedir(const char *path, struct fuse_file_info *fi)
777{
c688e1b3
SH
778 struct file_info *d = (struct file_info *)fi->fh;
779
8f6e8f5e 780 do_release_file_info(d);
758ad80c
SH
781 return 0;
782}
783
99978832
SH
784static int cg_open(const char *path, struct fuse_file_info *fi)
785{
99978832 786 const char *cgroup;
febf2b87 787 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
35482f91 788 struct cgfs_files *k = NULL;
8f6e8f5e 789 struct file_info *file_info;
99978832 790 struct fuse_context *fc = fuse_get_context();
2c51f8dd 791 int ret;
99978832
SH
792
793 if (!fc)
794 return -EIO;
795
796 controller = pick_controller_from_path(fc, path);
797 if (!controller)
798 return -EIO;
799 cgroup = find_cgroup_in_path(path);
800 if (!cgroup)
801 return -EINVAL;
802
febf2b87
SH
803 get_cgdir_and_path(cgroup, &cgdir, &last);
804 if (!last) {
99978832
SH
805 path1 = "/";
806 path2 = cgdir;
807 } else {
808 path1 = cgdir;
febf2b87 809 path2 = last;
99978832
SH
810 }
811
35482f91 812 k = cgfs_get_key(controller, path1, path2);
2c51f8dd
SH
813 if (!k) {
814 ret = -EINVAL;
815 goto out;
816 }
817 free_key(k);
99978832 818
0dcc31ea 819 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
820 if (initpid <= 0)
821 initpid = fc->pid;
0dcc31ea 822 if (!caller_may_see_dir(initpid, controller, path1)) {
a8b6c3e0
SH
823 ret = -ENOENT;
824 goto out;
825 }
2c51f8dd 826 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
8f6e8f5e 827 // should never get here
2c51f8dd
SH
828 ret = -EACCES;
829 goto out;
830 }
99978832 831
8f6e8f5e 832 /* we'll free this at cg_release */
2c51f8dd
SH
833 file_info = malloc(sizeof(*file_info));
834 if (!file_info) {
835 ret = -ENOMEM;
836 goto out;
837 }
35482f91
SH
838 file_info->controller = must_copy_string(controller);
839 file_info->cgroup = must_copy_string(path1);
840 file_info->file = must_copy_string(path2);
443d13f5 841 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
842 file_info->buf = NULL;
843 file_info->buflen = 0;
844
845 fi->fh = (unsigned long)file_info;
2c51f8dd
SH
846 ret = 0;
847
848out:
849 free(cgdir);
850 return ret;
8f6e8f5e
SH
851}
852
853static int cg_release(const char *path, struct fuse_file_info *fi)
854{
855 struct file_info *f = (struct file_info *)fi->fh;
856
857 do_release_file_info(f);
858 return 0;
99978832
SH
859}
860
5b2dfd85
SH
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock is readable (or hung up), for at most about @timeout
 * seconds in total.  The wait is restarted with the remaining time when
 * epoll_wait() is interrupted by a signal.
 *
 * Returns true if an event was reported.  Returns false on timeout or
 * error; errno is 0 when the deadline was found to have passed before
 * waiting, and otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		/* release the epoll fd on this path too, then report "no error" */
		close(epfd);
		errno = 0;
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep epoll_wait's value for the caller */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
a05660a6 907
5b2dfd85
SH
/*
 * Receive up to @len bytes from @sockfd, waiting at most about two
 * seconds for data to arrive.  Returns -1 if nothing arrived in time,
 * otherwise the result of the non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
914
01e71852
SH
915#define SEND_CREDS_OK 0
916#define SEND_CREDS_NOTSK 1
917#define SEND_CREDS_FAIL 2
918static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
919{
920 struct msghdr msg = { 0 };
921 struct iovec iov;
922 struct cmsghdr *cmsg;
923 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
924 char buf[1];
925 buf[0] = 'p';
926
01e71852
SH
927 if (pingfirst) {
928 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 929 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
930 __func__);
931 return SEND_CREDS_FAIL;
932 }
a05660a6
SH
933 }
934
935 msg.msg_control = cmsgbuf;
936 msg.msg_controllen = sizeof(cmsgbuf);
937
938 cmsg = CMSG_FIRSTHDR(&msg);
939 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
940 cmsg->cmsg_level = SOL_SOCKET;
941 cmsg->cmsg_type = SCM_CREDENTIALS;
942 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
943
944 msg.msg_name = NULL;
945 msg.msg_namelen = 0;
946
947 buf[0] = v;
948 iov.iov_base = buf;
949 iov.iov_len = sizeof(buf);
950 msg.msg_iov = &iov;
951 msg.msg_iovlen = 1;
952
953 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 954 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
955 strerror(errno));
956 if (errno == 3)
01e71852
SH
957 return SEND_CREDS_NOTSK;
958 return SEND_CREDS_FAIL;
a05660a6
SH
959 }
960
01e71852 961 return SEND_CREDS_OK;
a05660a6
SH
962}
963
964static bool recv_creds(int sock, struct ucred *cred, char *v)
965{
966 struct msghdr msg = { 0 };
967 struct iovec iov;
968 struct cmsghdr *cmsg;
969 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
970 char buf[1];
971 int ret;
972 int optval = 1;
973
974 *v = '1';
975
976 cred->pid = -1;
977 cred->uid = -1;
978 cred->gid = -1;
979
980 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 981 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
982 return false;
983 }
984 buf[0] = '1';
985 if (write(sock, buf, 1) != 1) {
1420baf8 986 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
987 return false;
988 }
989
990 msg.msg_name = NULL;
991 msg.msg_namelen = 0;
992 msg.msg_control = cmsgbuf;
993 msg.msg_controllen = sizeof(cmsgbuf);
994
995 iov.iov_base = buf;
996 iov.iov_len = sizeof(buf);
997 msg.msg_iov = &iov;
998 msg.msg_iovlen = 1;
999
5b2dfd85
SH
1000 if (!wait_for_sock(sock, 2)) {
1001 fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
6ee867dc
SH
1002 strerror(errno));
1003 return false;
1004 }
1005 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 1006 if (ret < 0) {
1420baf8 1007 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
1008 strerror(errno));
1009 return false;
1010 }
1011
1012 cmsg = CMSG_FIRSTHDR(&msg);
1013
1014 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
1015 cmsg->cmsg_level == SOL_SOCKET &&
1016 cmsg->cmsg_type == SCM_CREDENTIALS) {
1017 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
1018 }
1019 *v = buf[0];
1020
1021 return true;
1022}
1023
1024
1025/*
4775fba1
SH
1026 * pid_to_ns - reads pids from a ucred over a socket, then writes the
1027 * int value back over the socket. This shifts the pid from the
1028 * sender's pidns into tpid's pidns.
a05660a6 1029 */
4775fba1 1030static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
1031{
1032 char v = '0';
1033 struct ucred cred;
1034
1035 while (recv_creds(sock, &cred, &v)) {
1036 if (v == '1')
67bd113f 1037 _exit(0);
a05660a6 1038 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
67bd113f 1039 _exit(1);
a05660a6 1040 }
67bd113f 1041 _exit(0);
a05660a6
SH
1042}
1043
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids
 *
 * Never returns.  Exits with status 0 only if the child acknowledged its
 * start over the pipe within about a second and later exited with
 * status 0; exits with status 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	cpid = fork();
	if (cpid < 0)
		_exit(1);

	if (!cpid) {
		char b = '1';

		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}
	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1097
1098/*
1099 * To read cgroup files with a particular pid, we will setns into the child
1100 * pidns, open a pipe, fork a child - which will be the first to really be in
35482f91 1101 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
a05660a6
SH
1102 */
1103static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1104{
1105 int sock[2] = {-1, -1};
2c51f8dd 1106 char *tmpdata = NULL;
a05660a6
SH
1107 int ret;
1108 pid_t qpid, cpid = -1;
1109 bool answer = false;
1110 char v = '0';
1111 struct ucred cred;
2c51f8dd 1112 size_t sz = 0, asz = 0;
a05660a6 1113
35482f91 1114 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
a05660a6
SH
1115 return false;
1116
1117 /*
1118 * Now we read the pids from returned data one by one, pass
1119 * them into a child in the target namespace, read back the
1120 * translated pids, and put them into our to-return data
1121 */
1122
1123 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1124 perror("socketpair");
2c51f8dd
SH
1125 free(tmpdata);
1126 return false;
a05660a6
SH
1127 }
1128
1129 cpid = fork();
1130 if (cpid == -1)
1131 goto out;
1132
ff96a5f9 1133 if (!cpid) // child - exits when done
4775fba1 1134 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
1135
1136 char *ptr = tmpdata;
1137 cred.uid = 0;
1138 cred.gid = 0;
1139 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1140 cred.pid = qpid;
01e71852
SH
1141 ret = send_creds(sock[0], &cred, v, true);
1142
1143 if (ret == SEND_CREDS_NOTSK)
1144 goto next;
1145 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
1146 goto out;
1147
1148 // read converted results
5b2dfd85
SH
1149 if (!wait_for_sock(sock[0], 2)) {
1150 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
6ee867dc 1151 __func__, strerror(errno));
a05660a6
SH
1152 goto out;
1153 }
1154 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1155 fprintf(stderr, "%s: error reading pid from child: %s\n",
1156 __func__, strerror(errno));
a05660a6
SH
1157 goto out;
1158 }
2c51f8dd 1159 must_strcat_pid(d, &sz, &asz, qpid);
01e71852 1160next:
a05660a6
SH
1161 ptr = strchr(ptr, '\n');
1162 if (!ptr)
1163 break;
1164 ptr++;
1165 }
1166
1167 cred.pid = getpid();
1168 v = '1';
01e71852 1169 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1170 // failed to ask child to exit
6ee867dc
SH
1171 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1172 __func__, strerror(errno));
a05660a6
SH
1173 goto out;
1174 }
1175
1176 answer = true;
1177
1178out:
2c51f8dd 1179 free(tmpdata);
a05660a6
SH
1180 if (cpid != -1)
1181 wait_for_pid(cpid);
1182 if (sock[0] != -1) {
1183 close(sock[0]);
1184 close(sock[1]);
1185 }
1186 return answer;
1187}
1188
99978832
SH
1189static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1190 struct fuse_file_info *fi)
1191{
99978832 1192 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1193 struct file_info *f = (struct file_info *)fi->fh;
35482f91 1194 struct cgfs_files *k = NULL;
2c51f8dd
SH
1195 char *data = NULL;
1196 int ret, s;
1197 bool r;
99978832 1198
443d13f5 1199 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1200 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1201 return -EIO;
1202 }
1203
99978832 1204 if (offset)
7253e0a4 1205 return 0;
99978832
SH
1206
1207 if (!fc)
1208 return -EIO;
1209
8f6e8f5e 1210 if (!f->controller)
99978832
SH
1211 return -EINVAL;
1212
35482f91 1213 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2c51f8dd
SH
1214 return -EINVAL;
1215 }
1216 free_key(k);
99978832 1217
99978832 1218
2c51f8dd
SH
1219 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1220 ret = -EACCES;
1221 goto out;
1222 }
a05660a6 1223
2c51f8dd
SH
1224 if (strcmp(f->file, "tasks") == 0 ||
1225 strcmp(f->file, "/tasks") == 0 ||
1226 strcmp(f->file, "/cgroup.procs") == 0 ||
1227 strcmp(f->file, "cgroup.procs") == 0)
1228 // special case - we have to translate the pids
1229 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1230 else
35482f91 1231 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
99978832 1232
2c51f8dd
SH
1233 if (!r) {
1234 ret = -EINVAL;
1235 goto out;
1236 }
99978832 1237
2c51f8dd
SH
1238 if (!data) {
1239 ret = 0;
1240 goto out;
99978832 1241 }
2c51f8dd
SH
1242 s = strlen(data);
1243 if (s > size)
1244 s = size;
1245 memcpy(buf, data, s);
1246 if (s > 0 && s < size && data[s-1] != '\n')
1247 buf[s++] = '\n';
99978832 1248
2c51f8dd
SH
1249 ret = s;
1250
1251out:
1252 free(data);
1253 return ret;
99978832
SH
1254}
1255
4775fba1
SH
1256static void pid_from_ns(int sock, pid_t tpid)
1257{
1258 pid_t vpid;
1259 struct ucred cred;
1260 char v;
6ee867dc 1261 int ret;
4775fba1
SH
1262
1263 cred.uid = 0;
1264 cred.gid = 0;
6ee867dc 1265 while (1) {
5b2dfd85
SH
1266 if (!wait_for_sock(sock, 2)) {
1267 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
67bd113f 1268 _exit(1);
6ee867dc
SH
1269 }
1270 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1271 fprintf(stderr, "%s: bad read from parent: %s\n",
1272 __func__, strerror(errno));
67bd113f 1273 _exit(1);
6ee867dc 1274 }
4775fba1 1275 if (vpid == -1) // done
01e71852 1276 break;
4775fba1
SH
1277 v = '0';
1278 cred.pid = vpid;
01e71852 1279 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1280 v = '1';
1281 cred.pid = getpid();
01e71852 1282 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
67bd113f 1283 _exit(1);
4775fba1
SH
1284 }
1285 }
67bd113f 1286 _exit(0);
4775fba1
SH
1287}
1288
/*
 * pid_from_ns_wrapper: enter the pid namespace of @tpid and fork a child
 * that runs pid_from_ns() on @sock.
 *
 * After setns() only a newly forked child is really inside the target
 * namespace, hence the extra fork.  The child acknowledges its start by
 * writing '1' to a pipe; if that ack does not arrive within one second the
 * child is killed and a new one is forked.
 *
 * Never returns: exits 0 when the child is reaped successfully, 1 on any
 * setup failure or when waiting for the child fails.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int cpipe[2];
	int nsfd, len;
	pid_t child;
	char ack;

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			_exit(1);

		if (child == 0) {
			char b = '1';

			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);
		}

		/*
		 * give the child 1 second to be done forking and
		 * write its ack
		 */
		if (wait_for_sock(cpipe[0], 1) &&
		    read(cpipe[0], &ack, 1) == sizeof(char) && ack == '1')
			_exit(wait_for_pid(child) ? 0 : 1);

		/* no valid ack: get rid of this child and try again */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}
}
1344
8ee2a503
SH
/*
 * hostuid_to_ns: translate host @uid through /proc/@pid/uid_map.
 *
 * The translated id, as computed by convert_id_to_ns(), is stored in
 * *@answer.
 *
 * Returns false when the uid_map file cannot be opened or when the
 * translation yields -1; true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *map;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	map = fopen(mapfile, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
1366
/*
 * get_pid_creds: look up the uid and gid of @pid in /proc/@pid/status.
 *
 * The first number following the "Uid:" and "Gid:" tags is stored in *@uid
 * and *@gid respectively.  Both outputs are preset to -1 and keep that
 * value when the file cannot be opened or the corresponding line is missing
 * or malformed; parsing stops at the first malformed line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (status == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status) != NULL) {
		if (strncmp(line, "Uid:", 4) == 0) {
			uid_t u;

			if (sscanf(line + 4, "%u", &u) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			gid_t g;

			if (sscanf(line + 4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = g;
		}
	}

	fclose(status);
}
1405
/*
 * may_move_pid: may requestor @r (host uid @r_uid) move victim @v to a
 * new cgroup?
 *
 * Allowed when
 *  - @r and @v are the same task,
 *  - @r is root on the host,
 *  - both tasks are owned by the same uid, or
 *  - @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
1431
1432static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
1433 const char *file, const char *buf)
4775fba1
SH
1434{
1435 int sock[2] = {-1, -1};
1436 pid_t qpid, cpid = -1;
35482f91 1437 FILE *pids_file = NULL;
4775fba1
SH
1438 bool answer = false, fail = false;
1439
35482f91
SH
1440 pids_file = open_pids_file(contrl, cg);
1441 if (!pids_file)
1442 return false;
1443
4775fba1
SH
1444 /*
1445 * write the pids to a socket, have helper in writer's pidns
1446 * call movepid for us
1447 */
1448 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1449 perror("socketpair");
35482f91 1450 goto out;
4775fba1
SH
1451 }
1452
1453 cpid = fork();
1454 if (cpid == -1)
1455 goto out;
1456
35482f91
SH
1457 if (!cpid) { // child
1458 fclose(pids_file);
4775fba1 1459 pid_from_ns_wrapper(sock[1], tpid);
35482f91 1460 }
4775fba1
SH
1461
1462 const char *ptr = buf;
1463 while (sscanf(ptr, "%d", &qpid) == 1) {
1464 struct ucred cred;
1465 char v;
1466
1467 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1468 fprintf(stderr, "%s: error writing pid to child: %s\n",
1469 __func__, strerror(errno));
4775fba1
SH
1470 goto out;
1471 }
1472
01e71852
SH
1473 if (recv_creds(sock[0], &cred, &v)) {
1474 if (v == '0') {
8ee2a503
SH
1475 if (!may_move_pid(tpid, tuid, cred.pid)) {
1476 fail = true;
1477 break;
1478 }
35482f91 1479 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
01e71852
SH
1480 fail = true;
1481 }
4775fba1
SH
1482 }
1483
1484 ptr = strchr(ptr, '\n');
1485 if (!ptr)
1486 break;
1487 ptr++;
1488 }
1489
1490 /* All good, write the value */
1491 qpid = -1;
1492 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1493 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1494
1495 if (!fail)
1496 answer = true;
1497
1498out:
1499 if (cpid != -1)
1500 wait_for_pid(cpid);
1501 if (sock[0] != -1) {
1502 close(sock[0]);
1503 close(sock[1]);
1504 }
35482f91
SH
1505 if (pids_file) {
1506 if (fclose(pids_file) != 0)
1507 answer = false;
1508 }
4775fba1
SH
1509 return answer;
1510}
1511
2ad6d2bd
SH
1512int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1513 struct fuse_file_info *fi)
1514{
2ad6d2bd 1515 struct fuse_context *fc = fuse_get_context();
2c51f8dd 1516 char *localbuf = NULL;
35482f91 1517 struct cgfs_files *k = NULL;
8f6e8f5e 1518 struct file_info *f = (struct file_info *)fi->fh;
2c51f8dd 1519 bool r;
2ad6d2bd 1520
443d13f5 1521 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1522 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1523 return -EIO;
1524 }
1525
2ad6d2bd 1526 if (offset)
7253e0a4 1527 return 0;
2ad6d2bd
SH
1528
1529 if (!fc)
1530 return -EIO;
1531
2c51f8dd 1532 localbuf = alloca(size+1);
47cbf0e5
SH
1533 localbuf[size] = '\0';
1534 memcpy(localbuf, buf, size);
2ad6d2bd 1535
35482f91 1536 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2c51f8dd
SH
1537 size = -EINVAL;
1538 goto out;
1539 }
2ad6d2bd 1540
2c51f8dd
SH
1541 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1542 size = -EACCES;
1543 goto out;
1544 }
4775fba1 1545
2c51f8dd
SH
1546 if (strcmp(f->file, "tasks") == 0 ||
1547 strcmp(f->file, "/tasks") == 0 ||
1548 strcmp(f->file, "/cgroup.procs") == 0 ||
1549 strcmp(f->file, "cgroup.procs") == 0)
1550 // special case - we have to translate the pids
8ee2a503 1551 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2c51f8dd 1552 else
35482f91 1553 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2ad6d2bd 1554
2c51f8dd
SH
1555 if (!r)
1556 size = -EINVAL;
2ad6d2bd 1557
2c51f8dd
SH
1558out:
1559 free_key(k);
1560 return size;
2ad6d2bd
SH
1561}
1562
341b21ad
SH
1563int cg_chown(const char *path, uid_t uid, gid_t gid)
1564{
1565 struct fuse_context *fc = fuse_get_context();
febf2b87 1566 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
35482f91 1567 struct cgfs_files *k = NULL;
341b21ad 1568 const char *cgroup;
2c51f8dd 1569 int ret;
341b21ad
SH
1570
1571 if (!fc)
1572 return -EIO;
1573
1574 if (strcmp(path, "/cgroup") == 0)
1575 return -EINVAL;
1576
1577 controller = pick_controller_from_path(fc, path);
1578 if (!controller)
f9a05025 1579 return -EINVAL;
341b21ad
SH
1580 cgroup = find_cgroup_in_path(path);
1581 if (!cgroup)
1582 /* this is just /cgroup/controller */
1583 return -EINVAL;
1584
febf2b87 1585 get_cgdir_and_path(cgroup, &cgdir, &last);
341b21ad 1586
febf2b87 1587 if (!last) {
341b21ad
SH
1588 path1 = "/";
1589 path2 = cgdir;
1590 } else {
1591 path1 = cgdir;
febf2b87 1592 path2 = last;
341b21ad
SH
1593 }
1594
1595 if (is_child_cgroup(controller, path1, path2)) {
1596 // get uid, gid, from '/tasks' file and make up a mode
1597 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
35482f91 1598 k = cgfs_get_key(controller, cgroup, "tasks");
341b21ad
SH
1599
1600 } else
35482f91 1601 k = cgfs_get_key(controller, path1, path2);
341b21ad 1602
2c51f8dd
SH
1603 if (!k) {
1604 ret = -EINVAL;
1605 goto out;
1606 }
341b21ad
SH
1607
1608 /*
1609 * This being a fuse request, the uid and gid must be valid
1610 * in the caller's namespace. So we can just check to make
1611 * sure that the caller is root in his uid, and privileged
1612 * over the file's current owner.
1613 */
2c51f8dd
SH
1614 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1615 ret = -EACCES;
1616 goto out;
1617 }
341b21ad 1618
1f69d62e 1619 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2c51f8dd
SH
1620
1621out:
1622 free_key(k);
1623 free(cgdir);
1624
1625 return ret;
341b21ad 1626}
2ad6d2bd 1627
fd2e4e03
SH
1628int cg_chmod(const char *path, mode_t mode)
1629{
0a1bb5ea 1630 struct fuse_context *fc = fuse_get_context();
febf2b87 1631 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
35482f91 1632 struct cgfs_files *k = NULL;
0a1bb5ea 1633 const char *cgroup;
2c51f8dd 1634 int ret;
0a1bb5ea
SH
1635
1636 if (!fc)
1637 return -EIO;
1638
1639 if (strcmp(path, "/cgroup") == 0)
1640 return -EINVAL;
1641
1642 controller = pick_controller_from_path(fc, path);
1643 if (!controller)
f9a05025 1644 return -EINVAL;
0a1bb5ea
SH
1645 cgroup = find_cgroup_in_path(path);
1646 if (!cgroup)
1647 /* this is just /cgroup/controller */
1648 return -EINVAL;
1649
febf2b87 1650 get_cgdir_and_path(cgroup, &cgdir, &last);
0a1bb5ea 1651
febf2b87 1652 if (!last) {
0a1bb5ea
SH
1653 path1 = "/";
1654 path2 = cgdir;
1655 } else {
1656 path1 = cgdir;
febf2b87 1657 path2 = last;
0a1bb5ea
SH
1658 }
1659
1660 if (is_child_cgroup(controller, path1, path2)) {
1661 // get uid, gid, from '/tasks' file and make up a mode
1662 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
35482f91 1663 k = cgfs_get_key(controller, cgroup, "tasks");
0a1bb5ea
SH
1664
1665 } else
35482f91 1666 k = cgfs_get_key(controller, path1, path2);
0a1bb5ea 1667
2c51f8dd
SH
1668 if (!k) {
1669 ret = -EINVAL;
1670 goto out;
1671 }
0a1bb5ea
SH
1672
1673 /*
1674 * This being a fuse request, the uid and gid must be valid
1675 * in the caller's namespace. So we can just check to make
1676 * sure that the caller is root in his uid, and privileged
1677 * over the file's current owner.
1678 */
2c51f8dd
SH
1679 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1680 ret = -EPERM;
1681 goto out;
1682 }
0a1bb5ea 1683
35482f91 1684 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2c51f8dd
SH
1685 ret = -EINVAL;
1686 goto out;
1687 }
1688
1689 ret = 0;
1690out:
1691 free_key(k);
1692 free(cgdir);
1693 return ret;
fd2e4e03
SH
1694}
1695
ab54b798
SH
1696int cg_mkdir(const char *path, mode_t mode)
1697{
1698 struct fuse_context *fc = fuse_get_context();
febf2b87 1699 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
ab54b798 1700 const char *cgroup;
2c51f8dd 1701 int ret;
ab54b798 1702
ab54b798
SH
1703 if (!fc)
1704 return -EIO;
1705
1706
1707 controller = pick_controller_from_path(fc, path);
1708 if (!controller)
f9a05025 1709 return -EINVAL;
ab54b798
SH
1710
1711 cgroup = find_cgroup_in_path(path);
1712 if (!cgroup)
f9a05025 1713 return -EINVAL;
ab54b798 1714
febf2b87
SH
1715 get_cgdir_and_path(cgroup, &cgdir, &last);
1716 if (!last)
ab54b798
SH
1717 path1 = "/";
1718 else
1719 path1 = cgdir;
1720
0dcc31ea 1721 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
1722 if (initpid <= 0)
1723 initpid = fc->pid;
0dcc31ea 1724 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
87dce5f6
SH
1725 if (!next)
1726 ret = -EINVAL;
1727 else if (last && strcmp(next, last) == 0)
a8b6c3e0
SH
1728 ret = -EEXIST;
1729 else
1730 ret = -ENOENT;
1731 goto out;
1732 }
1733
2c51f8dd
SH
1734 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1735 ret = -EACCES;
1736 goto out;
1737 }
0dcc31ea 1738 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2c51f8dd
SH
1739 ret = -EACCES;
1740 goto out;
1741 }
ab54b798 1742
af869b9c 1743 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
ab54b798 1744
2c51f8dd
SH
1745out:
1746 free(cgdir);
a8b6c3e0 1747 free(next);
2c51f8dd 1748 return ret;
ab54b798
SH
1749}
1750
50d8d5b5
SH
1751static int cg_rmdir(const char *path)
1752{
1753 struct fuse_context *fc = fuse_get_context();
febf2b87 1754 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
50d8d5b5 1755 const char *cgroup;
2c51f8dd 1756 int ret;
50d8d5b5
SH
1757
1758 if (!fc)
1759 return -EIO;
1760
50d8d5b5
SH
1761 controller = pick_controller_from_path(fc, path);
1762 if (!controller)
f9a05025 1763 return -EINVAL;
50d8d5b5
SH
1764
1765 cgroup = find_cgroup_in_path(path);
1766 if (!cgroup)
f9a05025 1767 return -EINVAL;
50d8d5b5 1768
febf2b87
SH
1769 get_cgdir_and_path(cgroup, &cgdir, &last);
1770 if (!last) {
2c51f8dd
SH
1771 ret = -EINVAL;
1772 goto out;
1773 }
50d8d5b5 1774
0dcc31ea 1775 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
1776 if (initpid <= 0)
1777 initpid = fc->pid;
0dcc31ea 1778 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
febf2b87 1779 if (!last || strcmp(next, last) == 0)
a8b6c3e0
SH
1780 ret = -EBUSY;
1781 else
1782 ret = -ENOENT;
1783 goto out;
1784 }
1785
2c51f8dd
SH
1786 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1787 ret = -EACCES;
1788 goto out;
1789 }
0dcc31ea 1790 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2c51f8dd
SH
1791 ret = -EACCES;
1792 goto out;
1793 }
50d8d5b5 1794
35482f91 1795 if (!cgfs_remove(controller, cgroup)) {
2c51f8dd
SH
1796 ret = -EINVAL;
1797 goto out;
1798 }
50d8d5b5 1799
2c51f8dd
SH
1800 ret = 0;
1801
1802out:
1803 free(cgdir);
a8b6c3e0 1804 free(next);
2c51f8dd 1805 return ret;
50d8d5b5
SH
1806}
1807
2dc17609
SH
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1814
/*
 * get_mem_cached: find the first line of @memstat that begins with
 * "total_cache" and store the number following that key, divided by 1024,
 * in *@v.  *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur;

	*v = 0;
	for (cur = memstat; *cur != '\0'; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the newline; the loop increment steps past it */
		cur = strchr(cur, '\n');
		if (cur == NULL)
			return;
	}
}
1832
/*
 * get_blkio_io_value: search @str for the first line starting with
 * "<major>:<minor> <iotype>" and store the number following that prefix
 * in *@v.  *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur != '\0'; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		/* advance to the newline; the loop increment steps past it */
		cur = strchr(cur, '\n');
		if (cur == NULL)
			return;
	}
}
1855
53b43826
SH
1856static int read_file(const char *path, char *buf, size_t size,
1857 struct file_info *d)
1858{
1859 size_t linelen = 0, total_len = 0, rv = 0;
1860 char *line = NULL;
1861 char *cache = d->buf;
1862 size_t cache_size = d->buflen;
1863 FILE *f = fopen(path, "r");
1864 if (!f)
1865 return 0;
1866
1867 while (getline(&line, &linelen, f) != -1) {
1868 size_t l = snprintf(cache, cache_size, "%s", line);
1869 if (l < 0) {
1870 perror("Error writing to cache");
1871 rv = 0;
1872 goto err;
1873 }
1874 if (l >= cache_size) {
1875 fprintf(stderr, "Internal error: truncated write to cache\n");
1876 rv = 0;
1877 goto err;
1878 }
1879 if (l < cache_size) {
1880 cache += l;
1881 cache_size -= l;
1882 total_len += l;
1883 } else {
1884 cache += cache_size;
1885 total_len += cache_size;
1886 cache_size = 0;
1887 break;
1888 }
1889 }
1890
1891 d->size = total_len;
1892 if (total_len > size ) total_len = size;
1893
1894 /* read from off 0 */
1895 memcpy(buf, d->buf, total_len);
1896 rv = total_len;
1897 err:
1898 fclose(f);
1899 free(line);
1900 return rv;
1901}
1902
758ad80c 1903/*
2ad6d2bd 1904 * FUSE ops for /proc
758ad80c 1905 */
758ad80c 1906
7bc95a75
SH
/*
 * get_memlimit: return memory.limit_in_bytes of @cgroup, parsed as a
 * decimal number, or (unsigned long)-1 when the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &value))
		limit = strtoul(value, NULL, 10);
	free(value);

	return limit;
}
1919
/*
 * get_min_memlimit: return the smallest memory limit that applies to
 * @cgroup, i.e. the minimum of its own limit and the limits of all its
 * ancestors up to "/".  Ancestors whose limit cannot be read are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk);

	while (strcmp(walk, "/") != 0) {
		unsigned long cur;

		walk = dirname(walk);
		cur = get_memlimit(walk);
		if (cur != (unsigned long)-1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
1936
23ce2127
SH
1937static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1938 struct fuse_file_info *fi)
1939{
2dc17609 1940 struct fuse_context *fc = fuse_get_context();
97f1f27b 1941 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd 1942 char *cg;
4622ad78 1943 char *memusage_str = NULL, *memstat_str = NULL,
b731895e
NW
1944 *memswlimit_str = NULL, *memswusage_str = NULL,
1945 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
4622ad78
TG
1946 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
1947 cached = 0, hosttotal = 0;
2dc17609 1948 char *line = NULL;
e1068397 1949 size_t linelen = 0, total_len = 0, rv = 0;
97f1f27b
YY
1950 char *cache = d->buf;
1951 size_t cache_size = d->buflen;
2c51f8dd 1952 FILE *f = NULL;
2dc17609 1953
97f1f27b
YY
1954 if (offset){
1955 if (offset > d->size)
1956 return -EINVAL;
b5ad2d21
SH
1957 if (!d->cached)
1958 return 0;
97f1f27b
YY
1959 int left = d->size - offset;
1960 total_len = left > size ? size: left;
1961 memcpy(buf, cache + offset, total_len);
1962 return total_len;
1963 }
2dc17609 1964
0dcc31ea 1965 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
1966 if (initpid <= 0)
1967 initpid = fc->pid;
0dcc31ea 1968 cg = get_pid_cgroup(initpid, "memory");
2dc17609 1969 if (!cg)
53b43826 1970 return read_file("/proc/meminfo", buf, size, d);
2dc17609 1971
7bc95a75 1972 memlimit = get_min_memlimit(cg);
35482f91 1973 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2c51f8dd 1974 goto err;
35482f91 1975 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2c51f8dd 1976 goto err;
4622ad78
TG
1977
1978 // Following values are allowed to fail, because swapaccount might be turned
1979 // off for current kernel
1980 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
1981 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
1982 {
b731895e
NW
1983 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
1984 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
1985 goto err;
1986 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
1987 goto err;
1988
4622ad78
TG
1989 memswlimit = strtoul(memswlimit_str, NULL, 10);
1990 memswusage = strtoul(memswusage_str, NULL, 10);
b731895e
NW
1991
1992 if (!strcmp(memswlimit_str, memswlimit_default_str))
a2de34ba 1993 memswlimit = 0;
b731895e 1994 if (!strcmp(memswusage_str, memswusage_default_str))
a2de34ba
SH
1995 memswusage = 0;
1996
b731895e
NW
1997 memswlimit = memswlimit / 1024;
1998 memswusage = memswusage / 1024;
4622ad78 1999 }
b731895e
NW
2000
2001 memusage = strtoul(memusage_str, NULL, 10);
2002 memlimit /= 1024;
2003 memusage /= 1024;
2004
2dc17609
SH
2005 get_mem_cached(memstat_str, &cached);
2006
2007 f = fopen("/proc/meminfo", "r");
2008 if (!f)
2c51f8dd 2009 goto err;
2dc17609
SH
2010
2011 while (getline(&line, &linelen, f) != -1) {
2012 size_t l;
2013 char *printme, lbuf[100];
2014
2015 memset(lbuf, 0, 100);
2016 if (startswith(line, "MemTotal:")) {
2017 sscanf(line+14, "%lu", &hosttotal);
2018 if (hosttotal < memlimit)
2019 memlimit = hosttotal;
2020 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2021 printme = lbuf;
2022 } else if (startswith(line, "MemFree:")) {
2023 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2024 printme = lbuf;
2025 } else if (startswith(line, "MemAvailable:")) {
2026 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2027 printme = lbuf;
4622ad78
TG
2028 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2029 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2030 printme = lbuf;
2031 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2032 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2033 (memswlimit - memlimit) - (memswusage - memusage));
2034 printme = lbuf;
2dc17609
SH
2035 } else if (startswith(line, "Buffers:")) {
2036 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2037 printme = lbuf;
2038 } else if (startswith(line, "Cached:")) {
2039 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2040 printme = lbuf;
2041 } else if (startswith(line, "SwapCached:")) {
2042 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2043 printme = lbuf;
2044 } else
2045 printme = line;
97f1f27b
YY
2046
2047 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
2048 if (l < 0) {
2049 perror("Error writing to cache");
2050 rv = 0;
2051 goto err;
2052
2053 }
2054 if (l >= cache_size) {
2055 fprintf(stderr, "Internal error: truncated write to cache\n");
2056 rv = 0;
2057 goto err;
2058 }
2059
97f1f27b
YY
2060 cache += l;
2061 cache_size -= l;
2f919d9d 2062 total_len += l;
2dc17609
SH
2063 }
2064
b5ad2d21 2065 d->cached = 1;
97f1f27b
YY
2066 d->size = total_len;
2067 if (total_len > size ) total_len = size;
2068 memcpy(buf, d->buf, total_len);
2069
e1068397 2070 rv = total_len;
2c51f8dd
SH
2071err:
2072 if (f)
2073 fclose(f);
92c84dc4 2074 free(line);
2c51f8dd 2075 free(cg);
2c51f8dd 2076 free(memusage_str);
4622ad78
TG
2077 free(memswlimit_str);
2078 free(memswusage_str);
2c51f8dd 2079 free(memstat_str);
b731895e
NW
2080 free(memswlimit_default_str);
2081 free(memswusage_default_str);
e1068397 2082 return rv;
23ce2127
SH
2083}
2084
/*
 * get_cpuset: read cpuset.cpus of cgroup @cg.
 *
 * Returns the value as a newly allocated string which the caller must
 * free, or NULL when it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2097
fa47bb52 2098bool cpu_in_cpuset(int cpu, const char *cpuset);
23ce2127 2099
aeb56147
SH
/*
 * cpuline_in_cpuset: parse the cpu number out of a "processor : N" line
 * and report whether cpu_in_cpuset() accepts it for @cpuset.  Lines that
 * do not parse yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
2108
23ce2127
SH
/*
 * is_processor_line: return true if @line has the form
 * "processor : <number>", as found in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
2120
23ce2127
SH
2121static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2122 struct fuse_file_info *fi)
2123{
2124 struct fuse_context *fc = fuse_get_context();
97f1f27b 2125 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2126 char *cg;
2127 char *cpuset = NULL;
23ce2127 2128 char *line = NULL;
e1068397 2129 size_t linelen = 0, total_len = 0, rv = 0;
23ce2127
SH
2130 bool am_printing = false;
2131 int curcpu = -1;
97f1f27b
YY
2132 char *cache = d->buf;
2133 size_t cache_size = d->buflen;
2c51f8dd 2134 FILE *f = NULL;
23ce2127 2135
97f1f27b
YY
2136 if (offset){
2137 if (offset > d->size)
2138 return -EINVAL;
b5ad2d21
SH
2139 if (!d->cached)
2140 return 0;
97f1f27b
YY
2141 int left = d->size - offset;
2142 total_len = left > size ? size: left;
2143 memcpy(buf, cache + offset, total_len);
2f919d9d 2144 return total_len;
97f1f27b 2145 }
23ce2127 2146
0dcc31ea 2147 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
2148 if (initpid <= 0)
2149 initpid = fc->pid;
0dcc31ea 2150 cg = get_pid_cgroup(initpid, "cpuset");
23ce2127 2151 if (!cg)
53b43826 2152 return read_file("proc/cpuinfo", buf, size, d);
23ce2127
SH
2153
2154 cpuset = get_cpuset(cg);
2155 if (!cpuset)
2c51f8dd 2156 goto err;
23ce2127
SH
2157
2158 f = fopen("/proc/cpuinfo", "r");
2159 if (!f)
2c51f8dd 2160 goto err;
23ce2127
SH
2161
2162 while (getline(&line, &linelen, f) != -1) {
2163 size_t l;
2164 if (is_processor_line(line)) {
aeb56147 2165 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
2166 if (am_printing) {
2167 curcpu ++;
97f1f27b 2168 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
e1068397
MM
2169 if (l < 0) {
2170 perror("Error writing to cache");
2171 rv = 0;
2172 goto err;
2173 }
2174 if (l >= cache_size) {
2175 fprintf(stderr, "Internal error: truncated write to cache\n");
2176 rv = 0;
2177 goto err;
2178 }
97f1f27b
YY
2179 if (l < cache_size){
2180 cache += l;
2181 cache_size -= l;
2182 total_len += l;
2183 }else{
2184 cache += cache_size;
2185 total_len += cache_size;
2186 cache_size = 0;
2187 break;
2188 }
23ce2127
SH
2189 }
2190 continue;
2191 }
2192 if (am_printing) {
97f1f27b 2193 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2194 if (l < 0) {
2195 perror("Error writing to cache");
2196 rv = 0;
2197 goto err;
2198 }
2199 if (l >= cache_size) {
2200 fprintf(stderr, "Internal error: truncated write to cache\n");
2201 rv = 0;
2202 goto err;
2203 }
97f1f27b
YY
2204 if (l < cache_size) {
2205 cache += l;
2206 cache_size -= l;
2207 total_len += l;
2208 } else {
2209 cache += cache_size;
2210 total_len += cache_size;
2211 cache_size = 0;
2212 break;
2213 }
23ce2127
SH
2214 }
2215 }
2216
b5ad2d21 2217 d->cached = 1;
97f1f27b
YY
2218 d->size = total_len;
2219 if (total_len > size ) total_len = size;
2220
2221 /* read from off 0 */
2222 memcpy(buf, d->buf, total_len);
e1068397 2223 rv = total_len;
2c51f8dd
SH
2224err:
2225 if (f)
2226 fclose(f);
92c84dc4 2227 free(line);
2c51f8dd
SH
2228 free(cpuset);
2229 free(cg);
e1068397 2230 return rv;
23ce2127
SH
2231}
2232
2233static int proc_stat_read(char *buf, size_t size, off_t offset,
2234 struct fuse_file_info *fi)
2235{
aeb56147 2236 struct fuse_context *fc = fuse_get_context();
97f1f27b 2237 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2238 char *cg;
2239 char *cpuset = NULL;
aeb56147 2240 char *line = NULL;
e1068397 2241 size_t linelen = 0, total_len = 0, rv = 0;
2a0fde62 2242 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
2243 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2244 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2245 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2246#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2247 char cpuall[CPUALL_MAX_SIZE];
2248 /* reserve for cpu all */
2249 char *cache = d->buf + CPUALL_MAX_SIZE;
2250 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2c51f8dd 2251 FILE *f = NULL;
aeb56147 2252
97f1f27b
YY
2253 if (offset){
2254 if (offset > d->size)
2255 return -EINVAL;
b5ad2d21
SH
2256 if (!d->cached)
2257 return 0;
97f1f27b
YY
2258 int left = d->size - offset;
2259 total_len = left > size ? size: left;
2260 memcpy(buf, d->buf + offset, total_len);
2f919d9d 2261 return total_len;
97f1f27b 2262 }
aeb56147 2263
0dcc31ea 2264 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
2265 if (initpid <= 0)
2266 initpid = fc->pid;
0dcc31ea 2267 cg = get_pid_cgroup(initpid, "cpuset");
aeb56147 2268 if (!cg)
53b43826 2269 return read_file("/proc/stat", buf, size, d);
aeb56147
SH
2270
2271 cpuset = get_cpuset(cg);
2272 if (!cpuset)
2c51f8dd 2273 goto err;
aeb56147
SH
2274
2275 f = fopen("/proc/stat", "r");
2276 if (!f)
2c51f8dd 2277 goto err;
aeb56147 2278
97f1f27b
YY
2279 //skip first line
2280 if (getline(&line, &linelen, f) < 0) {
2281 fprintf(stderr, "proc_stat_read read first line failed\n");
2c51f8dd 2282 goto err;
97f1f27b
YY
2283 }
2284
aeb56147
SH
2285 while (getline(&line, &linelen, f) != -1) {
2286 size_t l;
2287 int cpu;
2a0fde62 2288 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
2289 char *c;
2290
2a0fde62
CB
2291 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2292 /* not a ^cpuN line containing a number N, just print it */
97f1f27b 2293 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2294 if (l < 0) {
2295 perror("Error writing to cache");
2296 rv = 0;
2297 goto err;
2298 }
2299 if (l >= cache_size) {
2300 fprintf(stderr, "Internal error: truncated write to cache\n");
2301 rv = 0;
2302 goto err;
2303 }
2304 if (l < cache_size) {
97f1f27b
YY
2305 cache += l;
2306 cache_size -= l;
2307 total_len += l;
2308 continue;
e1068397 2309 } else {
97f1f27b
YY
2310 //no more space, break it
2311 cache += cache_size;
2312 total_len += cache_size;
2313 cache_size = 0;
2314 break;
2315 }
aeb56147 2316 }
2a0fde62
CB
2317
2318 if (sscanf(cpu_char, "%d", &cpu) != 1)
2319 continue;
aeb56147
SH
2320 if (!cpu_in_cpuset(cpu, cpuset))
2321 continue;
2322 curcpu ++;
2323
2324 c = strchr(line, ' ');
2325 if (!c)
2326 continue;
25c5e8fb 2327 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
e1068397
MM
2328 if (l < 0) {
2329 perror("Error writing to cache");
2330 rv = 0;
2331 goto err;
2332
2333 }
2334 if (l >= cache_size) {
2335 fprintf(stderr, "Internal error: truncated write to cache\n");
2336 rv = 0;
2337 goto err;
2338 }
2339
97f1f27b
YY
2340 cache += l;
2341 cache_size -= l;
aeb56147 2342 total_len += l;
2f919d9d 2343
97f1f27b
YY
2344 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2345 &softirq, &steal, &guest) != 9)
2346 continue;
2347 user_sum += user;
2348 nice_sum += nice;
2349 system_sum += system;
2350 idle_sum += idle;
2351 iowait_sum += iowait;
2352 irq_sum += irq;
2353 softirq_sum += softirq;
2354 steal_sum += steal;
2f919d9d 2355 guest_sum += guest;
97f1f27b
YY
2356 }
2357
2358 cache = d->buf;
2359
2f919d9d 2360 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
2361 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2362 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2363 memcpy(cache, cpuall, cpuall_len);
2f919d9d 2364 cache += cpuall_len;
2c51f8dd 2365 } else{
97f1f27b
YY
2366 /* shouldn't happen */
2367 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2368 cpuall_len = 0;
2369 }
2370
2371 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2372 total_len += cpuall_len;
b5ad2d21 2373 d->cached = 1;
97f1f27b
YY
2374 d->size = total_len;
2375 if (total_len > size ) total_len = size;
2376
2377 memcpy(buf, d->buf, total_len);
e1068397 2378 rv = total_len;
2c51f8dd
SH
2379
2380err:
2381 if (f)
2382 fclose(f);
92c84dc4 2383 free(line);
2c51f8dd
SH
2384 free(cpuset);
2385 free(cg);
e1068397 2386 return rv;
23ce2127
SH
2387}
2388
/*
 * Return, in seconds, how long ago the ctime of /proc/<reaper> was set,
 * where <reaper> is the pid reported by get_init_pid_for_task() for @pid.
 * Returns 0 when the reaper cannot be determined or stat'ed.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = get_init_pid_for_task(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
2409
0b6af11b
SH
2410/*
2411 * fork a task which switches to @task's namespace and writes '1'.
2412 * over a unix sock so we can read the task's reaper's pid in our
2413 * namespace
2414 */
2415void write_task_init_pid_exit(int sock, pid_t target)
41bb9357 2416{
0b6af11b
SH
2417 struct ucred cred;
2418 char fnam[100];
2419 pid_t pid;
2420 char v;
2421 int fd, ret;
38056ebc 2422
0b6af11b
SH
2423 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
2424 if (ret < 0 || ret >= sizeof(fnam))
ff96a5f9 2425 _exit(1);
0b6af11b
SH
2426
2427 fd = open(fnam, O_RDONLY);
2428 if (fd < 0) {
ff96a5f9
SH
2429 perror("write_task_init_pid_exit open of ns/pid");
2430 _exit(1);
0b6af11b
SH
2431 }
2432 if (setns(fd, 0)) {
ff96a5f9 2433 perror("write_task_init_pid_exit setns 1");
0b6af11b 2434 close(fd);
ff96a5f9 2435 _exit(1);
0b6af11b
SH
2436 }
2437 pid = fork();
2438 if (pid < 0)
ff96a5f9 2439 _exit(1);
0b6af11b 2440 if (pid != 0) {
87dce5f6
SH
2441 if (!wait_for_pid(pid))
2442 _exit(1);
ff96a5f9 2443 _exit(0);
0b6af11b
SH
2444 }
2445
2446 /* we are the child */
2447 cred.uid = 0;
2448 cred.gid = 0;
2449 cred.pid = 1;
2450 v = '1';
87dce5f6
SH
2451 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
2452 _exit(1);
ff96a5f9 2453 _exit(0);
0b6af11b
SH
2454}
2455
/*
 * Determine the pid, as seen from our pid namespace, of the reaper of
 * @task's pid namespace.  A helper process is forked which runs
 * write_task_init_pid_exit(); the answer is taken from the credentials
 * received on the other end of a socketpair.
 *
 * Returns the pid, or -1 on failure.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int pair[2];
	struct ucred cred;
	char v = '0';
	pid_t answer = -1;
	pid_t helper;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		close(pair[1]);
		write_task_init_pid_exit(pair[0], task);
		_exit(0);
	}

	/* only wait for an answer when the helper actually exists */
	if (helper > 0 && recv_creds(pair[1], &cred, &v))
		answer = cred.pid;

	close(pair[0]);
	close(pair[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return answer;
}
2489
/*
 * Return the value of cpuacct.usage for the cpuacct cgroup of @task's
 * reaper, divided by 1000000000.  Returns 0 if the reaper, its cgroup
 * or the value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = get_init_pid_for_task(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
		busy = strtoul(raw, NULL, 10) / 1000000000;

	free(cg);
	free(raw);
	return busy;
}
2512
2513/*
2514 * We read /proc/uptime and reuse its second field.
2515 * For the first field, we use the mtime for the reaper for
2516 * the calling pid as returned by getreaperage
2517 */
23ce2127
SH
2518static int proc_uptime_read(char *buf, size_t size, off_t offset,
2519 struct fuse_file_info *fi)
2520{
41bb9357 2521 struct fuse_context *fc = fuse_get_context();
97f1f27b 2522 struct file_info *d = (struct file_info *)fi->fh;
ff96a5f9 2523 long int reaperage = getreaperage(fc->pid);
0b6af11b 2524 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
b5ad2d21 2525 char *cache = d->buf;
97f1f27b 2526 size_t total_len = 0;
41bb9357 2527
97f1f27b
YY
2528 if (offset){
2529 if (offset > d->size)
2530 return -EINVAL;
b5ad2d21
SH
2531 if (!d->cached)
2532 return 0;
2533 int left = d->size - offset;
2534 total_len = left > size ? size: left;
2535 memcpy(buf, cache + offset, total_len);
2536 return total_len;
97f1f27b
YY
2537 }
2538
0b6af11b 2539 idletime = reaperage - busytime;
f6c0b279
SH
2540 if (idletime > reaperage)
2541 idletime = reaperage;
2542
b5ad2d21 2543 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
e1068397
MM
2544 if (total_len < 0){
2545 perror("Error writing to cache");
2546 return 0;
2547 }
cdcdb29b 2548
b5ad2d21
SH
2549 d->size = (int)total_len;
2550 d->cached = 1;
2551
2552 if (total_len > size) total_len = size;
2553
2554 memcpy(buf, d->buf, total_len);
97f1f27b 2555 return total_len;
23ce2127
SH
2556}
2557
49878439
YY
2558static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2559 struct fuse_file_info *fi)
2560{
2561 char dev_name[72];
2562 struct fuse_context *fc = fuse_get_context();
97f1f27b 2563 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2564 char *cg;
2565 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
49878439
YY
2566 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2567 unsigned long read = 0, write = 0;
2568 unsigned long read_merged = 0, write_merged = 0;
2569 unsigned long read_sectors = 0, write_sectors = 0;
2570 unsigned long read_ticks = 0, write_ticks = 0;
2571 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2572 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
b5ad2d21
SH
2573 char *cache = d->buf;
2574 size_t cache_size = d->buflen;
49878439 2575 char *line = NULL;
e1068397 2576 size_t linelen = 0, total_len = 0, rv = 0;
49878439
YY
2577 unsigned int major = 0, minor = 0;
2578 int i = 0;
2c51f8dd 2579 FILE *f = NULL;
49878439 2580
97f1f27b
YY
2581 if (offset){
2582 if (offset > d->size)
2583 return -EINVAL;
b5ad2d21
SH
2584 if (!d->cached)
2585 return 0;
2586 int left = d->size - offset;
2587 total_len = left > size ? size: left;
2588 memcpy(buf, cache + offset, total_len);
2589 return total_len;
97f1f27b 2590 }
49878439 2591
0dcc31ea 2592 pid_t initpid = get_init_pid_for_task(fc->pid);
87dce5f6
SH
2593 if (initpid <= 0)
2594 initpid = fc->pid;
0dcc31ea 2595 cg = get_pid_cgroup(initpid, "blkio");
49878439 2596 if (!cg)
53b43826 2597 return read_file("/proc/diskstats", buf, size, d);
49878439 2598
35482f91 2599 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2c51f8dd 2600 goto err;
35482f91 2601 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2c51f8dd 2602 goto err;
35482f91 2603 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2c51f8dd 2604 goto err;
35482f91 2605 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2c51f8dd 2606 goto err;
35482f91 2607 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2c51f8dd 2608 goto err;
49878439
YY
2609
2610
2611 f = fopen("/proc/diskstats", "r");
2612 if (!f)
2c51f8dd 2613 goto err;
49878439
YY
2614
2615 while (getline(&line, &linelen, f) != -1) {
2616 size_t l;
2617 char *printme, lbuf[256];
2618
c0adec85 2619 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
49878439
YY
2620 if(i == 3){
2621 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2622 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2623 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2624 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2625 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2626 read_sectors = read_sectors/512;
2627 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2628 write_sectors = write_sectors/512;
2f919d9d 2629
49878439
YY
2630 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2631 rd_svctm = rd_svctm/1000000;
2632 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2633 rd_wait = rd_wait/1000000;
2634 read_ticks = rd_svctm + rd_wait;
2635
2636 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2637 wr_svctm = wr_svctm/1000000;
2638 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2639 wr_wait = wr_wait/1000000;
2640 write_ticks = wr_svctm + wr_wait;
2641
2642 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2643 tot_ticks = tot_ticks/1000000;
2644 }else{
2645 continue;
2646 }
2647
2648 memset(lbuf, 0, 256);
2649 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2650 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2651 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2652 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2653 printme = lbuf;
2654 } else
2655 continue;
2656
b5ad2d21 2657 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
2658 if (l < 0) {
2659 perror("Error writing to fuse buf");
2660 rv = 0;
2661 goto err;
2662 }
b5ad2d21 2663 if (l >= cache_size) {
e1068397
MM
2664 fprintf(stderr, "Internal error: truncated write to cache\n");
2665 rv = 0;
2666 goto err;
2667 }
b5ad2d21
SH
2668 cache += l;
2669 cache_size -= l;
49878439
YY
2670 total_len += l;
2671 }
2672
b5ad2d21 2673 d->cached = 1;
97f1f27b 2674 d->size = total_len;
b5ad2d21
SH
2675 if (total_len > size ) total_len = size;
2676 memcpy(buf, d->buf, total_len);
2677
e1068397 2678 rv = total_len;
2c51f8dd
SH
2679err:
2680 free(cg);
2681 if (f)
2682 fclose(f);
49878439 2683 free(line);
2c51f8dd
SH
2684 free(io_serviced_str);
2685 free(io_merged_str);
2686 free(io_service_bytes_str);
2687 free(io_wait_time_str);
2688 free(io_service_time_str);
e1068397 2689 return rv;
49878439
YY
2690}
2691
/*
 * Return the number of bytes obtained by reading @which line by line
 * until getline() fails, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t got;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	for (;;) {
		got = getline(&linebuf, &cap, fp);
		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2708
/*
 * getattr for the /proc subtree.  "/proc" is reported as a read-only
 * directory and each emulated file as a read-only regular file of size
 * 0; all are owned by uid/gid 0 with every timestamp set to the current
 * time.  Returns -ENOENT for any other path and -EINVAL if the clock
 * cannot be read.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t k;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (k = 0; k < sizeof(emulated) / sizeof(emulated[0]); k++) {
		if (strcmp(path, emulated[k]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2736
2737static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2738 struct fuse_file_info *fi)
2739{
2740 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2741 filler(buf, "meminfo", NULL, 0) != 0 ||
2742 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2743 filler(buf, "uptime", NULL, 0) != 0 ||
2744 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2745 return -EINVAL;
758ad80c
SH
2746 return 0;
2747}
2748
35629743
SH
2749static int proc_open(const char *path, struct fuse_file_info *fi)
2750{
96fc5ee6
SH
2751 int type = -1;
2752 struct file_info *info;
2753
2754 if (strcmp(path, "/proc/meminfo") == 0)
2755 type = LXC_TYPE_PROC_MEMINFO;
2756 else if (strcmp(path, "/proc/cpuinfo") == 0)
2757 type = LXC_TYPE_PROC_CPUINFO;
2758 else if (strcmp(path, "/proc/uptime") == 0)
2759 type = LXC_TYPE_PROC_UPTIME;
2760 else if (strcmp(path, "/proc/stat") == 0)
2761 type = LXC_TYPE_PROC_STAT;
2762 else if (strcmp(path, "/proc/diskstats") == 0)
2763 type = LXC_TYPE_PROC_DISKSTATS;
2764 if (type == -1)
2765 return -ENOENT;
2766
2c51f8dd
SH
2767 info = malloc(sizeof(*info));
2768 if (!info)
2769 return -ENOMEM;
2770
96fc5ee6
SH
2771 memset(info, 0, sizeof(*info));
2772 info->type = type;
2773
97f1f27b 2774 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2c51f8dd
SH
2775 do {
2776 info->buf = malloc(info->buflen);
2777 } while (!info->buf);
97f1f27b
YY
2778 memset(info->buf, 0, info->buflen);
2779 /* set actual size to buffer size */
2f919d9d 2780 info->size = info->buflen;
97f1f27b 2781
96fc5ee6
SH
2782 fi->fh = (unsigned long)info;
2783 return 0;
2784}
2785
2786static int proc_release(const char *path, struct fuse_file_info *fi)
2787{
2788 struct file_info *f = (struct file_info *)fi->fh;
2789
2790 do_release_file_info(f);
2791 return 0;
35629743
SH
2792}
2793
35629743
SH
2794static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2795 struct fuse_file_info *fi)
2796{
96fc5ee6
SH
2797 struct file_info *f = (struct file_info *) fi->fh;
2798
2799 switch (f->type) {
2f919d9d 2800 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2801 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2802 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2803 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2804 case LXC_TYPE_PROC_UPTIME:
23ce2127 2805 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2806 case LXC_TYPE_PROC_STAT:
23ce2127 2807 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2808 case LXC_TYPE_PROC_DISKSTATS:
49878439 2809 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2810 default:
2811 return -EINVAL;
2812 }
35629743
SH
2813}
2814
/*
 * FUSE ops for /
 * These just delegate to the /proc and /cgroup ops as needed.
 */
758ad80c
SH
2820
2821static int lxcfs_getattr(const char *path, struct stat *sb)
2822{
2823 if (strcmp(path, "/") == 0) {
2824 sb->st_mode = S_IFDIR | 00755;
2825 sb->st_nlink = 2;
2826 return 0;
2827 }
2828 if (strncmp(path, "/cgroup", 7) == 0) {
2829 return cg_getattr(path, sb);
2830 }
35629743 2831 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2832 return proc_getattr(path, sb);
2833 }
2834 return -EINVAL;
2835}
2836
/*
 * Top-level opendir: "/" and "/proc" need no state and succeed
 * directly, "/cgroup..." is delegated to cg_opendir(), everything else
 * is -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	return -ENOENT;
}
2849
2850static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2851 struct fuse_file_info *fi)
2852{
2853 if (strcmp(path, "/") == 0) {
2854 if (filler(buf, "proc", NULL, 0) != 0 ||
2855 filler(buf, "cgroup", NULL, 0) != 0)
2856 return -EINVAL;
2857 return 0;
2858 }
35629743 2859 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2860 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2861 if (strcmp(path, "/proc") == 0)
2862 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2863 return -EINVAL;
2864}
2865
/*
 * Top-level releasedir: nothing to release for "/" and "/proc";
 * "/cgroup..." is delegated to cg_releasedir(); anything else is
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	return -EINVAL;
}
2877
/*
 * Top-level open: route by path prefix to cg_open() or proc_open();
 * -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_open(path, fi);
	if (under_proc)
		return proc_open(path, fi);
	return -EINVAL;
}
2887
/*
 * Top-level read: route by path prefix to cg_read() or proc_read();
 * -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (under_proc)
		return proc_read(path, buf, size, offset, fi);
	return -EINVAL;
}
2898
/*
 * Top-level write: only paths beginning with "/cgroup" are handled,
 * by cg_write(); every other path gets -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2908
/* flush handler: does nothing and always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;
	return 0;
}
2913
/*
 * Top-level release: route by path prefix to cg_release() or
 * proc_release(); -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_release(path, fi);
	if (under_proc)
		return proc_release(path, fi);
	return -EINVAL;
}
2923
/* fsync handler: does nothing and always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;
	return 0;
}
2928
/*
 * Top-level mkdir: only paths beginning with "/cgroup" are handled,
 * by cg_mkdir(); every other path gets -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2936
/*
 * Top-level chown: only paths beginning with "/cgroup" are handled,
 * by cg_chown(); every other path gets -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2944
/*
 * cat truncates a file before calling ops->write.  Truncation has no
 * useful meaning for cgroup files, so for "/cgroup..." paths report
 * success without doing anything; other paths get -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;
	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2956
/*
 * Top-level rmdir: only paths beginning with "/cgroup" are handled,
 * by cg_rmdir(); every other path gets -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2963
/*
 * Top-level chmod: only paths beginning with "/cgroup" are handled,
 * by cg_chmod(); every other path gets -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2970
758ad80c
SH
2971const struct fuse_operations lxcfs_ops = {
2972 .getattr = lxcfs_getattr,
2973 .readlink = NULL,
2974 .getdir = NULL,
2975 .mknod = NULL,
ab54b798 2976 .mkdir = lxcfs_mkdir,
758ad80c 2977 .unlink = NULL,
50d8d5b5 2978 .rmdir = lxcfs_rmdir,
758ad80c
SH
2979 .symlink = NULL,
2980 .rename = NULL,
2981 .link = NULL,
fd2e4e03 2982 .chmod = lxcfs_chmod,
341b21ad 2983 .chown = lxcfs_chown,
2ad6d2bd 2984 .truncate = lxcfs_truncate,
758ad80c 2985 .utime = NULL,
99978832
SH
2986
2987 .open = lxcfs_open,
2988 .read = lxcfs_read,
2989 .release = lxcfs_release,
2ad6d2bd 2990 .write = lxcfs_write,
99978832 2991
758ad80c 2992 .statfs = NULL,
99978832
SH
2993 .flush = lxcfs_flush,
2994 .fsync = lxcfs_fsync,
758ad80c
SH
2995
2996 .setxattr = NULL,
2997 .getxattr = NULL,
2998 .listxattr = NULL,
2999 .removexattr = NULL,
3000
3001 .opendir = lxcfs_opendir,
3002 .readdir = lxcfs_readdir,
3003 .releasedir = lxcfs_releasedir,
3004
3005 .fsyncdir = NULL,
3006 .init = NULL,
3007 .destroy = NULL,
3008 .access = NULL,
3009 .create = NULL,
3010 .ftruncate = NULL,
3011 .fgetattr = NULL,
3012};
3013
/* Print the command-line synopsis for @me to stderr and exit with status 1. */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
3022
/* Return true if @w is one of the accepted spellings of the help flag. */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h", "--help", "-help", "help",
	};
	size_t k;

	for (k = 0; k < sizeof(spellings) / sizeof(spellings[0]); k++) {
		if (strcmp(w, spellings[k]) == 0)
			return true;
	}
	return false;
}
3032
/*
 * Remove the first occurrence of @which from argv[1..] by shifting the
 * remaining entries (including the terminating NULL) down one slot,
 * and decrement *argcp.  Does nothing if @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* find the first matching argument, if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap */
	for (; *slot; slot++)
		*slot = slot[1];

	--*argcp;
}
3047
/*
 * Remove the first "@opt <value>" pair from argv[1..] and decrease
 * *argcp by two.  The value following @opt must equal @v; if it does
 * not, the offending value is reported on stderr and the program exits
 * with status 1.  Does nothing if @opt is absent or is the last
 * argument.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		/* an option needs a following value to be considered */
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually passed, not the expected value */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* shift the tail down by two; argv[i+2] is at worst the NULL */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3068
758ad80c
SH
3069int main(int argc, char *argv[])
3070{
c0adec85 3071 int ret = -1;
0b0f73db
SH
3072 /*
3073 * what we pass to fuse_main is:
3074 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3075 */
2c51f8dd
SH
3076 int nargs = 5, cnt = 0;
3077 char *newargv[6];
758ad80c 3078
977ac879 3079#ifdef FORTRAVIS
df062bcb
SH
3080 /* for travis which runs on 12.04 */
3081 if (glib_check_version (2, 36, 0) != NULL)
3082 g_type_init ();
977ac879 3083#endif
df062bcb 3084
0b0f73db
SH
3085 /* accomodate older init scripts */
3086 swallow_arg(&argc, argv, "-s");
3087 swallow_arg(&argc, argv, "-f");
3088 swallow_option(&argc, argv, "-o", "allow_other");
3089
2e9c0b32
SH
3090 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3091 fprintf(stderr, "%s\n", VERSION);
3092 exit(0);
3093 }
0b0f73db 3094 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
3095 usage(argv[0]);
3096
38a76a91 3097 newargv[cnt++] = argv[0];
38a76a91
SH
3098 newargv[cnt++] = "-f";
3099 newargv[cnt++] = "-o";
f466a31e 3100 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
38a76a91
SH
3101 newargv[cnt++] = argv[1];
3102 newargv[cnt++] = NULL;
758ad80c 3103
35482f91 3104 if (!cgfs_setup_controllers())
c0adec85 3105 goto out;
758ad80c 3106
35482f91 3107 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
758ad80c 3108
c0adec85 3109out:
758ad80c 3110 return ret;
2183082c 3111}