]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
Add /proc/diskstats
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13 #define FUSE_USE_VERSION 26
14
15 #include <stdio.h>
16 #include <dirent.h>
17 #include <fcntl.h>
18 #include <fuse.h>
19 #include <unistd.h>
20 #include <errno.h>
21 #include <stdbool.h>
22 #include <time.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <libgen.h>
26 #include <sched.h>
27 #include <linux/sched.h>
28 #include <sys/socket.h>
29 #include <sys/mount.h>
30 #include <wait.h>
31
32 #include <nih/alloc.h>
33 #include <nih/string.h>
34
35 #include "cgmanager.h"
36
/* Per-mount state; reached through the FUSE context's private_data. */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted cgroup
	 * subsystems.  It is detected once at startup.
	 */
	char **subsystems;
};
44 #define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
45
/*
 * Reap child @pid, retrying when waitpid() is interrupted or reports a
 * different pid.
 *
 * Returns 0 when the child exited normally with status 0, and -1 when
 * waitpid() failed or the child exited abnormally / with non-zero status.
 *
 * TODO - callers should use this result to return errors, especially
 * for reads/writes of tasks and cgroup.procs.
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		pid_t reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
		/* some other pid was reported - keep waiting for ours */
	}

	return (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) ? 0 : -1;
}
67
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open FILE * for a
 * /proc/pid/{u,g}id_map file.
 *
 * Each usable line holds three unsigned values: the base id inside the
 * namespace, the base id in the caller's namespace, and the number of
 * ids in the range.  Lines that do not parse are skipped.
 *
 * Returns the mapped id, or -1 (as unsigned int) when no range contains
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];

	/* always scan from the top, the stream may have been read before */
	fseek(idfile, 0L, SEEK_SET);

	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		unsigned int ns_base, host_base, range;

		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * A range wrapped around.  That is unexpected for a
			 * proc file, so give up altogether.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range, and neither range
		 * end wraps, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range covers in_id */
	return -1;
}
111
112 /*
113 * for is_privileged_over,
114 * specify whether we require the calling uid to be root in his
115 * namespace
116 */
117 #define NS_ROOT_REQD true
118 #define NS_ROOT_OPT false
119
120 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
121 {
122 nih_local char *fpath = NULL;
123 bool answer = false;
124 uid_t nsuid;
125
126 if (victim == -1 || uid == -1)
127 return false;
128
129 /*
130 * If the request is one not requiring root in the namespace,
131 * then having the same uid suffices. (i.e. uid 1000 has write
132 * access to files owned by uid 1000
133 */
134 if (!req_ns_root && uid == victim)
135 return true;
136
137 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
138 FILE *f = fopen(fpath, "r");
139 if (!f)
140 return false;
141
142 /* if caller's not root in his namespace, reject */
143 nsuid = convert_id_to_ns(f, uid);
144 if (nsuid)
145 goto out;
146
147 /*
148 * If victim is not mapped into caller's ns, reject.
149 * XXX I'm not sure this check is needed given that fuse
150 * will be sending requests where the vfs has converted
151 */
152 nsuid = convert_id_to_ns(f, victim);
153 if (nsuid == -1)
154 goto out;
155
156 answer = true;
157
158 out:
159 fclose(f);
160 return answer;
161 }
162
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR).
 *
 * Callers shift the owner or group bits down into the "other" position
 * before calling.  Returns false for an unrecognised access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted = req_mode & O_ACCMODE;
	mode_t needed;

	if (wanted == O_RDONLY)
		needed = S_IROTH;
	else if (wanted == O_WRONLY)
		needed = S_IWOTH;
	else if (wanted == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
182
/*
 * Return the first path component of @taskcg that lies below @querycg,
 * as a fresh nih-allocated string.
 *
 * @querycg is either "/" or a cgroup path; the character following it
 * in @taskcg is skipped as the separator.  Returns NULL (after logging)
 * if @taskcg is not longer than @querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* for the root cgroup only the leading '/' has to be skipped */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
201
202 /*
203 * check whether a fuse context may access a cgroup dir or file
204 *
205 * If file is not null, it is a cgroup file to check under cg.
206 * If file is null, then we are checking perms on cg itself.
207 *
208 * For files we can check the mode of the list_keys result.
209 * For cgroups, we must make assumptions based on the files under the
210 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
211 * yet.
212 */
213 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
214 {
215 nih_local struct cgm_keys **list = NULL;
216 int i;
217
218 if (!file)
219 file = "tasks";
220
221 if (*file == '/')
222 file++;
223
224 if (!cgm_list_keys(contrl, cg, &list))
225 return false;
226 for (i = 0; list[i]; i++) {
227 if (strcmp(list[i]->name, file) == 0) {
228 struct cgm_keys *k = list[i];
229 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
230 if (perms_include(k->mode >> 6, mode))
231 return true;
232 }
233 if (fc->gid == k->gid) {
234 if (perms_include(k->mode >> 3, mode))
235 return true;
236 }
237 return perms_include(k->mode, mode);
238 }
239 }
240
241 return false;
242 }
243
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
250
251 /*
252 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
253 * If caller is in /a, he may act on /a/b, but not on /b.
254 * if the answer is false and nextcg is not NULL, then *nextcg will point
255 * to a nih_alloc'd string containing the next cgroup directory under cg
256 */
257 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
258 {
259 nih_local char *fnam = NULL;
260 FILE *f;
261 bool answer = false;
262 char *line = NULL;
263 size_t len = 0;
264
265 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
266 if (!(f = fopen(fnam, "r")))
267 return false;
268
269 while (getline(&line, &len, f) != -1) {
270 char *c1, *c2, *linecmp;
271 if (!line[0])
272 continue;
273 c1 = strchr(line, ':');
274 if (!c1)
275 goto out;
276 c1++;
277 c2 = strchr(c1, ':');
278 if (!c2)
279 goto out;
280 *c2 = '\0';
281 if (strcmp(c1, contrl) != 0)
282 continue;
283 c2++;
284 stripnewline(c2);
285 /*
286 * callers pass in '/' for root cgroup, otherwise they pass
287 * in a cgroup without leading '/'
288 */
289 linecmp = *cg == '/' ? c2 : c2+1;
290 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
291 if (nextcg)
292 *nextcg = get_next_cgroup_dir(linecmp, cg);
293 goto out;
294 }
295 answer = true;
296 goto out;
297 }
298
299 out:
300 fclose(f);
301 free(line);
302 return answer;
303 }
304
305 /*
306 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
307 * and needs to be nih_freed.
308 */
309 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
310 {
311 const char *p1;
312 char *ret, *slash;
313
314 if (strlen(path) < 9)
315 return NULL;
316 p1 = path+8;
317 ret = nih_strdup(NULL, p1);
318 if (!ret)
319 return ret;
320 slash = strstr(ret, "/");
321 if (slash)
322 *slash = '\0';
323
324 /* verify that it is a subsystem */
325 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
326 int i;
327 if (!list) {
328 nih_free(ret);
329 return NULL;
330 }
331 for (i = 0; list[i]; i++) {
332 if (strcmp(list[i], ret) == 0)
333 return ret;
334 }
335 nih_free(ret);
336 return NULL;
337 }
338
/*
 * Locate the cgroup part of /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the '/' that ends the
 * controller name, or NULL when @path holds nothing beyond the
 * controller.  The returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	/* path + 8 skips "/cgroup/" and lands on the controller name */
	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
354
355 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
356 {
357 nih_local char **list = NULL;
358 int i;
359
360 if (!f)
361 return false;
362 if (*f == '/')
363 f++;
364
365 if (!cgm_list_children(contr, dir, &list))
366 return false;
367 for (i = 0; list[i]; i++) {
368 if (strcmp(list[i], f) == 0)
369 return true;
370 }
371
372 return false;
373 }
374
375 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
376 {
377 nih_local struct cgm_keys **list = NULL;
378 struct cgm_keys *k;
379 int i;
380
381 if (!f)
382 return NULL;
383 if (*f == '/')
384 f++;
385 if (!cgm_list_keys(contr, dir, &list))
386 return NULL;
387 for (i = 0; list[i]; i++) {
388 if (strcmp(list[i]->name, f) == 0) {
389 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
390 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
391 k->uid = list[i]->uid;
392 k->gid = list[i]->gid;
393 k->mode = list[i]->mode;
394 return k;
395 }
396 }
397
398 return NULL;
399 }
400
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of @cg, cut off at the last '/'
 * when there is one.  *file is set to point at that last '/' inside
 * @cg itself (so it is not separately allocated and keeps the leading
 * '/'), or to NULL when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy = NIH_MUST( nih_strdup(NULL, cg) );
	char *last = strrchr(cg, '/');

	*dir = copy;
	*file = last;

	/* the copy has the same contents, so the same offset applies */
	if (last)
		copy[last - cg] = '\0';
}
414
415 static size_t get_file_size(const char *contrl, const char *cg, const char *f)
416 {
417 nih_local char *data = NULL;
418 size_t s;
419 if (!cgm_get_value(contrl, cg, f, &data))
420 return -EINVAL;
421 s = strlen(data);
422 return s;
423 }
424
425 /*
426 * FUSE ops for /cgroup
427 */
428
429 static int cg_getattr(const char *path, struct stat *sb)
430 {
431 struct timespec now;
432 struct fuse_context *fc = fuse_get_context();
433 nih_local char * cgdir = NULL;
434 char *fpath = NULL, *path1, *path2;
435 nih_local struct cgm_keys *k = NULL;
436 const char *cgroup;
437 nih_local char *controller = NULL;
438
439
440 if (!fc)
441 return -EIO;
442
443 memset(sb, 0, sizeof(struct stat));
444
445 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
446 return -EINVAL;
447
448 sb->st_uid = sb->st_gid = 0;
449 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
450 sb->st_size = 0;
451
452 if (strcmp(path, "/cgroup") == 0) {
453 sb->st_mode = S_IFDIR | 00755;
454 sb->st_nlink = 2;
455 return 0;
456 }
457
458 controller = pick_controller_from_path(fc, path);
459 if (!controller)
460 return -EIO;
461 cgroup = find_cgroup_in_path(path);
462 if (!cgroup) {
463 /* this is just /cgroup/controller, return it as a dir */
464 sb->st_mode = S_IFDIR | 00755;
465 sb->st_nlink = 2;
466 return 0;
467 }
468
469 get_cgdir_and_path(cgroup, &cgdir, &fpath);
470
471 if (!fpath) {
472 path1 = "/";
473 path2 = cgdir;
474 } else {
475 path1 = cgdir;
476 path2 = fpath;
477 }
478
479 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
480 * Then check that caller's cgroup is under path if fpath is a child
481 * cgroup, or cgdir if fpath is a file */
482
483 if (is_child_cgroup(controller, path1, path2)) {
484 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
485 /* this is just /cgroup/controller, return it as a dir */
486 sb->st_mode = S_IFDIR | 00555;
487 sb->st_nlink = 2;
488 return 0;
489 }
490 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
491 return -EACCES;
492
493 // get uid, gid, from '/tasks' file and make up a mode
494 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
495 sb->st_mode = S_IFDIR | 00755;
496 k = get_cgroup_key(controller, cgroup, "tasks");
497 if (!k) {
498 sb->st_uid = sb->st_gid = 0;
499 } else {
500 sb->st_uid = k->uid;
501 sb->st_gid = k->gid;
502 }
503 sb->st_nlink = 2;
504 return 0;
505 }
506
507 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
508 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
509 return -ENOENT;
510 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
511 return -EACCES;
512
513 sb->st_mode = S_IFREG | k->mode;
514 sb->st_nlink = 1;
515 sb->st_uid = k->uid;
516 sb->st_gid = k->gid;
517 sb->st_size = get_file_size(controller, path1, path2);
518 return 0;
519 }
520
521 return -ENOENT;
522 }
523
524 /*
525 * TODO - cache these results in a table for use in opendir, free
526 * in releasedir
527 */
528 static int cg_opendir(const char *path, struct fuse_file_info *fi)
529 {
530 struct fuse_context *fc = fuse_get_context();
531 nih_local struct cgm_keys **list = NULL;
532 const char *cgroup;
533 nih_local char *controller = NULL;
534 nih_local char *nextcg = NULL;
535
536 if (!fc)
537 return -EIO;
538
539 if (strcmp(path, "/cgroup") == 0)
540 return 0;
541
542 // return list of keys for the controller, and list of child cgroups
543 controller = pick_controller_from_path(fc, path);
544 if (!controller)
545 return -EIO;
546
547 cgroup = find_cgroup_in_path(path);
548 if (!cgroup) {
549 /* this is just /cgroup/controller, return its contents */
550 cgroup = "/";
551 }
552
553 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
554 return -EACCES;
555 return 0;
556 }
557
558 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
559 struct fuse_file_info *fi)
560 {
561 struct fuse_context *fc = fuse_get_context();
562
563 if (!fc)
564 return -EIO;
565
566 if (strcmp(path, "/cgroup") == 0) {
567 // get list of controllers
568 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
569 int i;
570
571 if (!list)
572 return -EIO;
573
574 for (i = 0; list[i]; i++) {
575 if (filler(buf, list[i], NULL, 0) != 0) {
576 return -EIO;
577 }
578 }
579 return 0;
580 }
581
582 // return list of keys for the controller, and list of child cgroups
583 nih_local struct cgm_keys **list = NULL;
584 const char *cgroup;
585 nih_local char *controller = NULL;
586 int i;
587 nih_local char *nextcg = NULL;
588
589 controller = pick_controller_from_path(fc, path);
590 if (!controller)
591 return -EIO;
592
593 cgroup = find_cgroup_in_path(path);
594 if (!cgroup) {
595 /* this is just /cgroup/controller, return its contents */
596 cgroup = "/";
597 }
598
599 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
600 return -EACCES;
601
602 if (!cgm_list_keys(controller, cgroup, &list))
603 // not a valid cgroup
604 return -EINVAL;
605
606 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &nextcg)) {
607 if (nextcg) {
608 int ret;
609 ret = filler(buf, nextcg, NULL, 0);
610 if (ret != 0)
611 return -EIO;
612 }
613 return 0;
614 }
615
616 for (i = 0; list[i]; i++) {
617 if (filler(buf, list[i]->name, NULL, 0) != 0) {
618 return -EIO;
619 }
620 }
621
622 // now get the list of child cgroups
623 nih_local char **clist;
624
625 if (!cgm_list_children(controller, cgroup, &clist))
626 return 0;
627 for (i = 0; clist[i]; i++) {
628 if (filler(buf, clist[i], NULL, 0) != 0) {
629 return -EIO;
630 }
631 }
632 return 0;
633 }
634
/* FUSE releasedir for /cgroup: nothing to release, always returns 0. */
static int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
639
640 /*
641 * TODO - cache info here for read/write, release in cg_release.
642 */
643 static int cg_open(const char *path, struct fuse_file_info *fi)
644 {
645 nih_local char *controller = NULL;
646 const char *cgroup;
647 char *fpath = NULL, *path1, *path2;
648 nih_local char * cgdir = NULL;
649 nih_local struct cgm_keys *k = NULL;
650 struct fuse_context *fc = fuse_get_context();
651
652 if (!fc)
653 return -EIO;
654
655 controller = pick_controller_from_path(fc, path);
656 if (!controller)
657 return -EIO;
658 cgroup = find_cgroup_in_path(path);
659 if (!cgroup)
660 return -EINVAL;
661
662 get_cgdir_and_path(cgroup, &cgdir, &fpath);
663 if (!fpath) {
664 path1 = "/";
665 path2 = cgdir;
666 } else {
667 path1 = cgdir;
668 path2 = fpath;
669 }
670
671 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
672 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
673 // should never get here
674 return -EACCES;
675
676 return 0;
677 }
678
679 return -EINVAL;
680 }
681
682 static int msgrecv(int sockfd, void *buf, size_t len)
683 {
684 struct timeval tv;
685 fd_set rfds;
686
687 FD_ZERO(&rfds);
688 FD_SET(sockfd, &rfds);
689 tv.tv_sec = 2;
690 tv.tv_usec = 0;
691
692 if (select(sockfd+1, &rfds, NULL, NULL, &tv) < 0)
693 return -1;
694 return recv(sockfd, buf, len, MSG_DONTWAIT);
695 }
696
697 #define SEND_CREDS_OK 0
698 #define SEND_CREDS_NOTSK 1
699 #define SEND_CREDS_FAIL 2
700 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
701 {
702 struct msghdr msg = { 0 };
703 struct iovec iov;
704 struct cmsghdr *cmsg;
705 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
706 char buf[1];
707 buf[0] = 'p';
708
709 if (pingfirst) {
710 if (msgrecv(sock, buf, 1) != 1) {
711 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
712 __func__);
713 return SEND_CREDS_FAIL;
714 }
715 }
716
717 msg.msg_control = cmsgbuf;
718 msg.msg_controllen = sizeof(cmsgbuf);
719
720 cmsg = CMSG_FIRSTHDR(&msg);
721 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
722 cmsg->cmsg_level = SOL_SOCKET;
723 cmsg->cmsg_type = SCM_CREDENTIALS;
724 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
725
726 msg.msg_name = NULL;
727 msg.msg_namelen = 0;
728
729 buf[0] = v;
730 iov.iov_base = buf;
731 iov.iov_len = sizeof(buf);
732 msg.msg_iov = &iov;
733 msg.msg_iovlen = 1;
734
735 if (sendmsg(sock, &msg, 0) < 0) {
736 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
737 strerror(errno));
738 if (errno == 3)
739 return SEND_CREDS_NOTSK;
740 return SEND_CREDS_FAIL;
741 }
742
743 return SEND_CREDS_OK;
744 }
745
746 static bool recv_creds(int sock, struct ucred *cred, char *v)
747 {
748 struct msghdr msg = { 0 };
749 struct iovec iov;
750 struct cmsghdr *cmsg;
751 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
752 char buf[1];
753 int ret;
754 int optval = 1;
755
756 *v = '1';
757
758 cred->pid = -1;
759 cred->uid = -1;
760 cred->gid = -1;
761
762 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
763 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
764 return false;
765 }
766 buf[0] = '1';
767 if (write(sock, buf, 1) != 1) {
768 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
769 return false;
770 }
771
772 msg.msg_name = NULL;
773 msg.msg_namelen = 0;
774 msg.msg_control = cmsgbuf;
775 msg.msg_controllen = sizeof(cmsgbuf);
776
777 iov.iov_base = buf;
778 iov.iov_len = sizeof(buf);
779 msg.msg_iov = &iov;
780 msg.msg_iovlen = 1;
781
782 // retry logic is not ideal, especially as we are not
783 // threaded. Sleep at most 1 second waiting for the client
784 // to send us the scm_cred
785 ret = recvmsg(sock, &msg, 0);
786 if (ret < 0) {
787 fprintf(stderr, "Failed to receive scm_cred: %s\n",
788 strerror(errno));
789 return false;
790 }
791
792 cmsg = CMSG_FIRSTHDR(&msg);
793
794 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
795 cmsg->cmsg_level == SOL_SOCKET &&
796 cmsg->cmsg_type == SCM_CREDENTIALS) {
797 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
798 }
799 *v = buf[0];
800
801 return true;
802 }
803
804
805 /*
806 * pid_to_ns - reads pids from a ucred over a socket, then writes the
807 * int value back over the socket. This shifts the pid from the
808 * sender's pidns into tpid's pidns.
809 */
810 static void pid_to_ns(int sock, pid_t tpid)
811 {
812 char v = '0';
813 struct ucred cred;
814
815 while (recv_creds(sock, &cred, &v)) {
816 if (v == '1')
817 exit(0);
818 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
819 exit(1);
820 }
821 exit(0);
822 }
823
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * Never returns.  Exits 0 if the forked child exited cleanly, and 1 if
 * the namespace could not be entered, fork failed, or the child failed.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1;
	char fnam[100];
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();

	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_to_ns(sock, tpid);
	/* wait_for_pid() returns 0 on a clean exit, -1 otherwise */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
854
855 /*
856 * To read cgroup files with a particular pid, we will setns into the child
857 * pidns, open a pipe, fork a child - which will be the first to really be in
858 * the child ns - which does the cgm_get_value and writes the data to the pipe.
859 */
860 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
861 {
862 int sock[2] = {-1, -1};
863 nih_local char *tmpdata = NULL;
864 int ret;
865 pid_t qpid, cpid = -1;
866 bool answer = false;
867 char v = '0';
868 struct ucred cred;
869 struct timeval tv;
870 fd_set s;
871
872 if (!cgm_get_value(contrl, cg, file, &tmpdata))
873 return false;
874
875 /*
876 * Now we read the pids from returned data one by one, pass
877 * them into a child in the target namespace, read back the
878 * translated pids, and put them into our to-return data
879 */
880
881 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
882 perror("socketpair");
883 exit(1);
884 }
885
886 cpid = fork();
887 if (cpid == -1)
888 goto out;
889
890 if (!cpid) // child
891 pid_to_ns_wrapper(sock[1], tpid);
892
893 char *ptr = tmpdata;
894 cred.uid = 0;
895 cred.gid = 0;
896 while (sscanf(ptr, "%d\n", &qpid) == 1) {
897 cred.pid = qpid;
898 ret = send_creds(sock[0], &cred, v, true);
899
900 if (ret == SEND_CREDS_NOTSK)
901 goto next;
902 if (ret == SEND_CREDS_FAIL)
903 goto out;
904
905 // read converted results
906 FD_ZERO(&s);
907 FD_SET(sock[0], &s);
908 tv.tv_sec = 1;
909 tv.tv_usec = 0;
910 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
911 if (ret <= 0) {
912 kill(cpid, SIGTERM);
913 goto out;
914 }
915 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
916 kill(cpid, SIGTERM);
917 perror("read");
918 goto out;
919 }
920 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
921 next:
922 ptr = strchr(ptr, '\n');
923 if (!ptr)
924 break;
925 ptr++;
926 }
927
928 cred.pid = getpid();
929 v = '1';
930 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
931 // failed to ask child to exit
932 kill(cpid, SIGTERM);
933 goto out;
934 }
935
936 answer = true;
937
938 out:
939 if (cpid != -1)
940 wait_for_pid(cpid);
941 if (sock[0] != -1) {
942 close(sock[0]);
943 close(sock[1]);
944 }
945 return answer;
946 }
947
948 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
949 struct fuse_file_info *fi)
950 {
951 nih_local char *controller = NULL;
952 const char *cgroup;
953 char *fpath = NULL, *path1, *path2;
954 struct fuse_context *fc = fuse_get_context();
955 nih_local char * cgdir = NULL;
956 nih_local struct cgm_keys *k = NULL;
957
958 if (offset)
959 return -EIO;
960
961 if (!fc)
962 return -EIO;
963
964 controller = pick_controller_from_path(fc, path);
965 if (!controller)
966 return -EINVAL;
967 cgroup = find_cgroup_in_path(path);
968 if (!cgroup)
969 return -EINVAL;
970
971 get_cgdir_and_path(cgroup, &cgdir, &fpath);
972 if (!fpath) {
973 path1 = "/";
974 path2 = cgdir;
975 } else {
976 path1 = cgdir;
977 path2 = fpath;
978 }
979
980 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
981 nih_local char *data = NULL;
982 int s;
983 bool r;
984
985 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
986 // should never get here
987 return -EACCES;
988
989 if (strcmp(path2, "tasks") == 0 ||
990 strcmp(path2, "/tasks") == 0 ||
991 strcmp(path2, "/cgroup.procs") == 0 ||
992 strcmp(path2, "cgroup.procs") == 0)
993 // special case - we have to translate the pids
994 r = do_read_pids(fc->pid, controller, path1, path2, &data);
995 else
996 r = cgm_get_value(controller, path1, path2, &data);
997
998 if (!r)
999 return -EINVAL;
1000
1001 if (!data)
1002 return 0;
1003 s = strlen(data);
1004 if (s > size)
1005 s = size;
1006 memcpy(buf, data, s);
1007
1008 return s;
1009 }
1010
1011 return -EINVAL;
1012 }
1013
1014 static void pid_from_ns(int sock, pid_t tpid)
1015 {
1016 pid_t vpid;
1017 struct ucred cred;
1018 char v;
1019
1020 cred.uid = 0;
1021 cred.gid = 0;
1022 while (read(sock, &vpid, sizeof(pid_t)) == sizeof(pid_t)) {
1023 if (vpid == -1) // done
1024 break;
1025 v = '0';
1026 cred.pid = vpid;
1027 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1028 v = '1';
1029 cred.pid = getpid();
1030 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1031 exit(1);
1032 }
1033 }
1034 exit(0);
1035 }
1036
/*
 * Enter the pid namespace of @tpid and fork a child that runs
 * pid_from_ns() on @sock.  The fork is needed because only children
 * forked after the setns are in the target namespace.
 *
 * Never returns.  Exits 0 if the forked child exited cleanly, and 1 if
 * the namespace could not be entered, fork failed, or the child failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1;
	char fnam[100];
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();

	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_from_ns(sock, tpid);
	/* wait_for_pid() returns 0 on a clean exit, -1 otherwise */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1061
1062 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1063 {
1064 int sock[2] = {-1, -1};
1065 pid_t qpid, cpid = -1;
1066 bool answer = false, fail = false;
1067
1068 /*
1069 * write the pids to a socket, have helper in writer's pidns
1070 * call movepid for us
1071 */
1072 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1073 perror("socketpair");
1074 exit(1);
1075 }
1076
1077 cpid = fork();
1078 if (cpid == -1)
1079 goto out;
1080
1081 if (!cpid) // child
1082 pid_from_ns_wrapper(sock[1], tpid);
1083
1084 const char *ptr = buf;
1085 while (sscanf(ptr, "%d", &qpid) == 1) {
1086 struct ucred cred;
1087 char v;
1088
1089 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1090 kill(cpid, SIGTERM);
1091 perror("write");
1092 goto out;
1093 }
1094
1095 if (recv_creds(sock[0], &cred, &v)) {
1096 if (v == '0') {
1097 if (!cgm_move_pid(contrl, cg, cred.pid))
1098 fail = true;
1099 }
1100 }
1101
1102 ptr = strchr(ptr, '\n');
1103 if (!ptr)
1104 break;
1105 ptr++;
1106 }
1107
1108 /* All good, write the value */
1109 qpid = -1;
1110 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1111 fprintf(stderr, "Warning: failed to ask child to exit\n");
1112
1113 if (!fail)
1114 answer = true;
1115
1116 out:
1117 if (cpid != -1)
1118 wait_for_pid(cpid);
1119 if (sock[0] != -1) {
1120 close(sock[0]);
1121 close(sock[1]);
1122 }
1123 return answer;
1124 }
1125
1126 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1127 struct fuse_file_info *fi)
1128 {
1129 nih_local char *controller = NULL;
1130 const char *cgroup;
1131 char *fpath = NULL, *path1, *path2;
1132 struct fuse_context *fc = fuse_get_context();
1133 nih_local char * cgdir = NULL;
1134 nih_local struct cgm_keys *k = NULL;
1135 nih_local char *localbuf = NULL;
1136
1137 if (offset)
1138 return -EINVAL;
1139
1140 if (!fc)
1141 return -EIO;
1142
1143 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1144 localbuf[size] = '\0';
1145 memcpy(localbuf, buf, size);
1146 controller = pick_controller_from_path(fc, path);
1147 if (!controller)
1148 return -EINVAL;
1149 cgroup = find_cgroup_in_path(path);
1150 if (!cgroup)
1151 return -EINVAL;
1152
1153 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1154 if (!fpath) {
1155 path1 = "/";
1156 path2 = cgdir;
1157 } else {
1158 path1 = cgdir;
1159 path2 = fpath;
1160 }
1161
1162 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
1163 bool r;
1164
1165 if (!fc_may_access(fc, controller, path1, path2, O_WRONLY))
1166 return -EACCES;
1167
1168 if (strcmp(path2, "tasks") == 0 ||
1169 strcmp(path2, "/tasks") == 0 ||
1170 strcmp(path2, "/cgroup.procs") == 0 ||
1171 strcmp(path2, "cgroup.procs") == 0)
1172 // special case - we have to translate the pids
1173 r = do_write_pids(fc->pid, controller, path1, path2, localbuf);
1174 else
1175 r = cgm_set_value(controller, path1, path2, localbuf);
1176
1177 if (!r)
1178 return -EINVAL;
1179
1180 return size;
1181 }
1182
1183 return -EINVAL;
1184 }
1185
1186 int cg_chown(const char *path, uid_t uid, gid_t gid)
1187 {
1188 struct fuse_context *fc = fuse_get_context();
1189 nih_local char * cgdir = NULL;
1190 char *fpath = NULL, *path1, *path2;
1191 nih_local struct cgm_keys *k = NULL;
1192 const char *cgroup;
1193 nih_local char *controller = NULL;
1194
1195
1196 if (!fc)
1197 return -EIO;
1198
1199 if (strcmp(path, "/cgroup") == 0)
1200 return -EINVAL;
1201
1202 controller = pick_controller_from_path(fc, path);
1203 if (!controller)
1204 return -EINVAL;
1205 cgroup = find_cgroup_in_path(path);
1206 if (!cgroup)
1207 /* this is just /cgroup/controller */
1208 return -EINVAL;
1209
1210 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1211
1212 if (!fpath) {
1213 path1 = "/";
1214 path2 = cgdir;
1215 } else {
1216 path1 = cgdir;
1217 path2 = fpath;
1218 }
1219
1220 if (is_child_cgroup(controller, path1, path2)) {
1221 // get uid, gid, from '/tasks' file and make up a mode
1222 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1223 k = get_cgroup_key(controller, cgroup, "tasks");
1224
1225 } else
1226 k = get_cgroup_key(controller, path1, path2);
1227
1228 if (!k)
1229 return -EINVAL;
1230
1231 /*
1232 * This being a fuse request, the uid and gid must be valid
1233 * in the caller's namespace. So we can just check to make
1234 * sure that the caller is root in his uid, and privileged
1235 * over the file's current owner.
1236 */
1237 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1238 return -EACCES;
1239
1240 if (!cgm_chown_file(controller, cgroup, uid, gid))
1241 return -EINVAL;
1242 return 0;
1243 }
1244
1245 int cg_chmod(const char *path, mode_t mode)
1246 {
1247 struct fuse_context *fc = fuse_get_context();
1248 nih_local char * cgdir = NULL;
1249 char *fpath = NULL, *path1, *path2;
1250 nih_local struct cgm_keys *k = NULL;
1251 const char *cgroup;
1252 nih_local char *controller = NULL;
1253
1254 if (!fc)
1255 return -EIO;
1256
1257 if (strcmp(path, "/cgroup") == 0)
1258 return -EINVAL;
1259
1260 controller = pick_controller_from_path(fc, path);
1261 if (!controller)
1262 return -EINVAL;
1263 cgroup = find_cgroup_in_path(path);
1264 if (!cgroup)
1265 /* this is just /cgroup/controller */
1266 return -EINVAL;
1267
1268 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1269
1270 if (!fpath) {
1271 path1 = "/";
1272 path2 = cgdir;
1273 } else {
1274 path1 = cgdir;
1275 path2 = fpath;
1276 }
1277
1278 if (is_child_cgroup(controller, path1, path2)) {
1279 // get uid, gid, from '/tasks' file and make up a mode
1280 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1281 k = get_cgroup_key(controller, cgroup, "tasks");
1282
1283 } else
1284 k = get_cgroup_key(controller, path1, path2);
1285
1286 if (!k)
1287 return -EINVAL;
1288
1289 /*
1290 * This being a fuse request, the uid and gid must be valid
1291 * in the caller's namespace. So we can just check to make
1292 * sure that the caller is root in his uid, and privileged
1293 * over the file's current owner.
1294 */
1295 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1296 return -EPERM;
1297
1298 if (!cgm_chmod_file(controller, cgroup, mode))
1299 return -EINVAL;
1300 return 0;
1301 }
1302
1303 int cg_mkdir(const char *path, mode_t mode)
1304 {
1305 struct fuse_context *fc = fuse_get_context();
1306 nih_local struct cgm_keys **list = NULL;
1307 char *fpath = NULL, *path1;
1308 nih_local char * cgdir = NULL;
1309 const char *cgroup;
1310 nih_local char *controller = NULL;
1311
1312 if (!fc)
1313 return -EIO;
1314
1315
1316 controller = pick_controller_from_path(fc, path);
1317 if (!controller)
1318 return -EINVAL;
1319
1320 cgroup = find_cgroup_in_path(path);
1321 if (!cgroup)
1322 return -EINVAL;
1323
1324 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1325 if (!fpath)
1326 path1 = "/";
1327 else
1328 path1 = cgdir;
1329
1330 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1331 return -EACCES;
1332
1333
1334 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1335 return -EINVAL;
1336
1337 return 0;
1338 }
1339
1340 static int cg_rmdir(const char *path)
1341 {
1342 struct fuse_context *fc = fuse_get_context();
1343 nih_local struct cgm_keys **list = NULL;
1344 char *fpath = NULL;
1345 nih_local char * cgdir = NULL;
1346 const char *cgroup;
1347 nih_local char *controller = NULL;
1348
1349 if (!fc)
1350 return -EIO;
1351
1352
1353 controller = pick_controller_from_path(fc, path);
1354 if (!controller)
1355 return -EINVAL;
1356
1357 cgroup = find_cgroup_in_path(path);
1358 if (!cgroup)
1359 return -EINVAL;
1360
1361 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1362 if (!fpath)
1363 return -EINVAL;
1364
1365 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1366 return -EACCES;
1367
1368 if (!cgm_remove(controller, cgroup))
1369 return -EINVAL;
1370
1371 return 0;
1372 }
1373
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	size_t n = strlen(pref);

	return strncmp(line, pref, n) == 0;
}
1380
/*
 * Scan @memstat one line at a time for the first line beginning with
 * "total_cache", parse the number following that key, divide it by 1024
 * and store the result in *@v.  *@v is 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur, *nl;

	*v = 0;
	for (cur = memstat; *cur != '\0'; cur = nl + 1) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1398
/*
 * Look through the newline-separated text @str for the first line that
 * begins with "<major>:<minor> <iotype>" and store the number following
 * that key in *@v.  *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur != '\0'; cur = nl + 1) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1421
1422 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1423 {
1424 nih_local char *fnam = NULL;
1425 FILE *f;
1426 char *answer = NULL;
1427 char *line = NULL;
1428 size_t len = 0;
1429
1430 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1431 if (!(f = fopen(fnam, "r")))
1432 return false;
1433
1434 while (getline(&line, &len, f) != -1) {
1435 char *c1, *c2;
1436 if (!line[0])
1437 continue;
1438 c1 = strchr(line, ':');
1439 if (!c1)
1440 goto out;
1441 c1++;
1442 c2 = strchr(c1, ':');
1443 if (!c2)
1444 goto out;
1445 *c2 = '\0';
1446 if (strcmp(c1, contrl) != 0)
1447 continue;
1448 c2++;
1449 stripnewline(c2);
1450 answer = NIH_MUST( nih_strdup(NULL, c2) );
1451 goto out;
1452 }
1453
1454 out:
1455 fclose(f);
1456 free(line);
1457 return answer;
1458 }
1459
1460 /*
1461 * FUSE ops for /proc
1462 */
1463
1464 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1465 struct fuse_file_info *fi)
1466 {
1467 struct fuse_context *fc = fuse_get_context();
1468 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1469 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1470 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1471 char *line = NULL;
1472 size_t linelen = 0, total_len = 0;
1473 FILE *f;
1474
1475 if (offset)
1476 return -EINVAL;
1477
1478 if (!cg)
1479 return 0;
1480
1481 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1482 return 0;
1483 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1484 return 0;
1485 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1486 return 0;
1487 memlimit = strtoul(memlimit_str, NULL, 10);
1488 memusage = strtoul(memusage_str, NULL, 10);
1489 memlimit /= 1024;
1490 memusage /= 1024;
1491 get_mem_cached(memstat_str, &cached);
1492
1493 f = fopen("/proc/meminfo", "r");
1494 if (!f)
1495 return 0;
1496
1497 while (getline(&line, &linelen, f) != -1) {
1498 size_t l;
1499 char *printme, lbuf[100];
1500
1501 memset(lbuf, 0, 100);
1502 if (startswith(line, "MemTotal:")) {
1503 sscanf(line+14, "%lu", &hosttotal);
1504 if (hosttotal < memlimit)
1505 memlimit = hosttotal;
1506 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1507 printme = lbuf;
1508 } else if (startswith(line, "MemFree:")) {
1509 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1510 printme = lbuf;
1511 } else if (startswith(line, "MemAvailable:")) {
1512 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1513 printme = lbuf;
1514 } else if (startswith(line, "Buffers:")) {
1515 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1516 printme = lbuf;
1517 } else if (startswith(line, "Cached:")) {
1518 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1519 printme = lbuf;
1520 } else if (startswith(line, "SwapCached:")) {
1521 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1522 printme = lbuf;
1523 } else
1524 printme = line;
1525 l = snprintf(buf, size, "%s", printme);
1526 buf += l;
1527 size -= l;
1528 total_len += l;
1529 }
1530
1531 fclose(f);
1532 free(line);
1533 return total_len;
1534 }
1535
1536 /*
1537 * Read the cpuset.cpus for cg
1538 * Return the answer in a nih_alloced string
1539 */
/*
 * Fetch cpuset.cpus of cgroup @cg in the cpuset controller through
 * cgm_get_value().  Returns the string it produced, or NULL on failure.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1548
/*
 * Helper functions for cpu_in_cpuset
 */
/*
 * Return a pointer to the token following the next ',' in @c, searching
 * from the second character on, or NULL when there is no further token.
 * An empty string has no next token.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* do not look past the terminator of an empty string */
	if (!*c)
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}

/*
 * Parse the token at @c as "a-b".  Returns the sscanf() conversion
 * count: 2 for a range, 1 when only a single number could be read (only
 * *@a is set), anything lower when the token is not numeric.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	int ret;

	ret = sscanf(c, "%d-%d", a, b);
	return ret;
}

/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited ranges
 *
 * Return true when @cpu is listed in @cpuset, either as a single number
 * or inside a range.  A token that does not start with a number marks a
 * bad cpuset and ends the search with false.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret < 1) // bad cpuset!
			return false;
		if (ret == 1) {
			/* single cpu: a mismatch must not end the search */
			if (cpu == a)
				return true;
			continue;
		}
		if (cpu >= a && cpu <= b)
			return true;
	}

	return false;
}
1590
/*
 * Return true when @line is a "processor : N" line and cpu N is a
 * member of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
		cpu_in_cpuset(cpu, cpuset);
}
1599
/*
 * check whether this is a "^processor" line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	/* the cpu number itself is not needed, only whether one parses */
	return sscanf(line, "processor : %d", &ignored) == 1;
}
1611
1612 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1613 struct fuse_file_info *fi)
1614 {
1615 struct fuse_context *fc = fuse_get_context();
1616 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1617 nih_local char *cpuset = NULL;
1618 char *line = NULL;
1619 size_t linelen = 0, total_len = 0;
1620 bool am_printing = false;
1621 int curcpu = -1;
1622 FILE *f;
1623
1624 if (offset)
1625 return -EINVAL;
1626
1627 if (!cg)
1628 return 0;
1629
1630 cpuset = get_cpuset(cg);
1631 if (!cpuset)
1632 return 0;
1633
1634 f = fopen("/proc/cpuinfo", "r");
1635 if (!f)
1636 return 0;
1637
1638 while (getline(&line, &linelen, f) != -1) {
1639 size_t l;
1640 if (is_processor_line(line)) {
1641 am_printing = cpuline_in_cpuset(line, cpuset);
1642 if (am_printing) {
1643 curcpu ++;
1644 l = snprintf(buf, size, "processor : %d\n", curcpu);
1645 buf += l;
1646 size -= l;
1647 total_len += l;
1648 }
1649 continue;
1650 }
1651 if (am_printing) {
1652 l = snprintf(buf, size, "%s", line);
1653 buf += l;
1654 size -= l;
1655 total_len += l;
1656 }
1657 }
1658
1659 fclose(f);
1660 free(line);
1661 return total_len;
1662 }
1663
1664 static int proc_stat_read(char *buf, size_t size, off_t offset,
1665 struct fuse_file_info *fi)
1666 {
1667 struct fuse_context *fc = fuse_get_context();
1668 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1669 nih_local char *cpuset = NULL;
1670 char *line = NULL;
1671 size_t linelen = 0, total_len = 0;
1672 int curcpu = -1; /* cpu numbering starts at 0 */
1673 FILE *f;
1674
1675 if (offset)
1676 return -EINVAL;
1677
1678 if (!cg)
1679 return 0;
1680
1681 cpuset = get_cpuset(cg);
1682 if (!cpuset)
1683 return 0;
1684
1685 f = fopen("/proc/stat", "r");
1686 if (!f)
1687 return 0;
1688
1689 while (getline(&line, &linelen, f) != -1) {
1690 size_t l;
1691 int cpu;
1692 char cpu_char[10]; /* That's a lot of cores */
1693 char *c;
1694
1695 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1696 /* not a ^cpuN line containing a number N, just print it */
1697 l = snprintf(buf, size, "%s", line);
1698 buf += l;
1699 size -= l;
1700 total_len += l;
1701 continue;
1702 }
1703
1704 if (sscanf(cpu_char, "%d", &cpu) != 1)
1705 continue;
1706 if (!cpu_in_cpuset(cpu, cpuset))
1707 continue;
1708 curcpu ++;
1709
1710 c = strchr(line, ' ');
1711 if (!c)
1712 continue;
1713 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1714 buf += l;
1715 size -= l;
1716 total_len += l;
1717 }
1718
1719 fclose(f);
1720 free(line);
1721 return total_len;
1722 }
1723
1724 /*
1725 * How to guess what to present for uptime?
1726 * One thing we could do would be to take the date on the caller's
1727 * memory.usage_in_bytes file, which should equal the time of creation
1728 * of his cgroup. However, a task could be in a sub-cgroup of the
1729 * container. The same problem exists if we try to look at the ages
1730 * of processes in the caller's cgroup.
1731 *
1732 * So we'll fork a task that will enter the caller's pidns, mount a
1733 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1734 *
1735 * For the second uptime #, we'll do as Stéphane had done, just copy
1736 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1737 * time. Maybe someone can come up with a good algorithm and submit a
1738 * patch. Maybe something based on cpushare info?
1739 */
1740
1741 /* return age of the reaper for $pid, taken from ctime of its procdir */
1742 static long int get_pid1_time(pid_t pid)
1743 {
1744 char fnam[100];
1745 int fd;
1746 struct stat sb;
1747 int ret;
1748 pid_t npid;
1749
1750 if (unshare(CLONE_NEWNS))
1751 return 0;
1752
1753 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1754 perror("rslave mount failed");
1755 return 0;
1756 }
1757
1758 sprintf(fnam, "/proc/%d/ns/pid", pid);
1759 fd = open(fnam, O_RDONLY);
1760 if (fd < 0) {
1761 perror("get_pid1_time open of ns/pid");
1762 return 0;
1763 }
1764 if (setns(fd, 0)) {
1765 perror("get_pid1_time setns 1");
1766 close(fd);
1767 return 0;
1768 }
1769 close(fd);
1770 npid = fork();
1771 if (npid < 0)
1772 return 0;
1773
1774 if (npid) {
1775 // child will do the writing for us
1776 wait_for_pid(npid);
1777 exit(0);
1778 }
1779
1780 umount2("/proc", MNT_DETACH);
1781
1782 if (mount("proc", "/proc", "proc", 0, NULL)) {
1783 perror("get_pid1_time mount");
1784 return 0;
1785 }
1786 ret = lstat("/proc/1", &sb);
1787 if (ret) {
1788 perror("get_pid1_time lstat");
1789 return 0;
1790 }
1791 return time(NULL) - sb.st_ctime;
1792 }
1793
/*
 * Return the age in seconds of the reaper of @qpid's pid namespace.
 *
 * A child process computes the value with get_pid1_time() and sends it
 * back over a pipe.  The parent waits at most one second for the answer
 * and then reaps the child.  Returns 0 when the pipe or the child
 * cannot be created, on timeout, and on a failed or short read.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* nobody will ever write, and there is no child to reap */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		/* the child only writes */
		close(mypipe[0]);
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret == -1) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1839
/*
 * Return the whole-second part of the second field (idle time) of the
 * host's /proc/uptime, or 0 when the file cannot be opened or parsed.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	long int idle;
	int ret;

	if (!f)
		return 0;
	/*
	 * Both fields are written with a fractional part ("123.45 67.89").
	 * Skip the first field, fraction included, so that the conversion
	 * of the second one does not stop at the first '.'.
	 */
	ret = fscanf(f, "%*d.%*d %ld", &idle);
	fclose(f);
	if (ret != 1)
		return 0;
	return idle;
}
1853
/*
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the age of the reaper for the calling
 * pid, as returned by getreaperage (which derives it from the ctime of
 * the reaper's /proc directory).
 */
1859 static int proc_uptime_read(char *buf, size_t size, off_t offset,
1860 struct fuse_file_info *fi)
1861 {
1862 struct fuse_context *fc = fuse_get_context();
1863 long int reaperage = getreaperage(fc->pid);;
1864 long int idletime = getprocidle();
1865
1866 if (offset)
1867 return -EINVAL;
1868 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
1869 }
1870
1871 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
1872 struct fuse_file_info *fi)
1873 {
1874 char dev_name[72];
1875 struct fuse_context *fc = fuse_get_context();
1876 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
1877 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
1878 *io_wait_time_str = NULL, *io_service_time_str = NULL;
1879 unsigned long read = 0, write = 0;
1880 unsigned long read_merged = 0, write_merged = 0;
1881 unsigned long read_sectors = 0, write_sectors = 0;
1882 unsigned long read_ticks = 0, write_ticks = 0;
1883 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
1884 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
1885 char *line = NULL;
1886 size_t linelen = 0, total_len = 0;
1887 unsigned int major = 0, minor = 0;
1888 int i = 0;
1889 FILE *f;
1890
1891 if (offset)
1892 return -EINVAL;
1893
1894 if (!cg)
1895 return 0;
1896
1897 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
1898 return 0;
1899 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
1900 return 0;
1901 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
1902 return 0;
1903 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
1904 return 0;
1905 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
1906 return 0;
1907
1908
1909 f = fopen("/proc/diskstats", "r");
1910 if (!f)
1911 return 0;
1912
1913 while (getline(&line, &linelen, f) != -1) {
1914 size_t l;
1915 char *printme, lbuf[256];
1916
1917 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
1918 if(i == 3){
1919 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
1920 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
1921 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
1922 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
1923 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
1924 read_sectors = read_sectors/512;
1925 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
1926 write_sectors = write_sectors/512;
1927
1928 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
1929 rd_svctm = rd_svctm/1000000;
1930 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
1931 rd_wait = rd_wait/1000000;
1932 read_ticks = rd_svctm + rd_wait;
1933
1934 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
1935 wr_svctm = wr_svctm/1000000;
1936 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
1937 wr_wait = wr_wait/1000000;
1938 write_ticks = wr_svctm + wr_wait;
1939
1940 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
1941 tot_ticks = tot_ticks/1000000;
1942 }else{
1943 continue;
1944 }
1945
1946 memset(lbuf, 0, 256);
1947 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
1948 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1949 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
1950 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
1951 printme = lbuf;
1952 } else
1953 continue;
1954
1955 l = snprintf(buf, size, "%s", printme);
1956 buf += l;
1957 size -= l;
1958 total_len += l;
1959 }
1960
1961 fclose(f);
1962 free(line);
1963 return total_len;
1964 }
1965
/*
 * Return the number of bytes obtained by reading file @which line by
 * line until getline() fails, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;
	FILE *f;

	f = fopen(which, "r");
	if (f == NULL)
		return 0;

	for (;;) {
		n = getline(&line, &cap, f);
		if (n == -1)
			break;
		total += n;
	}

	fclose(f);
	free(line);

	return total;
}
1982
1983 static int proc_getattr(const char *path, struct stat *sb)
1984 {
1985 struct timespec now;
1986
1987 memset(sb, 0, sizeof(struct stat));
1988 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1989 return -EINVAL;
1990 sb->st_uid = sb->st_gid = 0;
1991 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1992 if (strcmp(path, "/proc") == 0) {
1993 sb->st_mode = S_IFDIR | 00555;
1994 sb->st_nlink = 2;
1995 return 0;
1996 }
1997 if (strcmp(path, "/proc/meminfo") == 0 ||
1998 strcmp(path, "/proc/cpuinfo") == 0 ||
1999 strcmp(path, "/proc/uptime") == 0 ||
2000 strcmp(path, "/proc/stat") == 0 ||
2001 strcmp(path, "/proc/diskstats") == 0) {
2002 sb->st_size = get_procfile_size(path);
2003 sb->st_mode = S_IFREG | 00444;
2004 sb->st_nlink = 1;
2005 return 0;
2006 }
2007
2008 return -ENOENT;
2009 }
2010
2011 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2012 struct fuse_file_info *fi)
2013 {
2014 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2015 filler(buf, "meminfo", NULL, 0) != 0 ||
2016 filler(buf, "stat", NULL, 0) != 0 ||
2017 filler(buf, "uptime", NULL, 0) != 0 ||
2018 filler(buf, "diskstats", NULL, 0) != 0)
2019 return -EINVAL;
2020 return 0;
2021 }
2022
/*
 * open handler for /proc: succeeds (0) for the emulated files and
 * returns -ENOENT for every other path.
 */
static int proc_open(const char *path, struct fuse_file_info *fi)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	size_t i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (!strcmp(path, files[i]))
			return 0;
	}
	return -ENOENT;
}
2033
2034 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2035 struct fuse_file_info *fi)
2036 {
2037 if (strcmp(path, "/proc/meminfo") == 0)
2038 return proc_meminfo_read(buf, size, offset, fi);
2039 if (strcmp(path, "/proc/cpuinfo") == 0)
2040 return proc_cpuinfo_read(buf, size, offset, fi);
2041 if (strcmp(path, "/proc/uptime") == 0)
2042 return proc_uptime_read(buf, size, offset, fi);
2043 if (strcmp(path, "/proc/stat") == 0)
2044 return proc_stat_read(buf, size, offset, fi);
2045 if (strcmp(path, "/proc/diskstats") == 0)
2046 return proc_diskstats_read(buf, size, offset, fi);
2047 return -EINVAL;
2048 }
2049
2050 /*
2051 * FUSE ops for /
2052 * these just delegate to the /proc and /cgroup ops as
2053 * needed
2054 */
2055
2056 static int lxcfs_getattr(const char *path, struct stat *sb)
2057 {
2058 if (strcmp(path, "/") == 0) {
2059 sb->st_mode = S_IFDIR | 00755;
2060 sb->st_nlink = 2;
2061 return 0;
2062 }
2063 if (strncmp(path, "/cgroup", 7) == 0) {
2064 return cg_getattr(path, sb);
2065 }
2066 if (strncmp(path, "/proc", 5) == 0) {
2067 return proc_getattr(path, sb);
2068 }
2069 return -EINVAL;
2070 }
2071
/*
 * opendir for the whole filesystem.  "/" and "/proc" always succeed,
 * paths starting with /cgroup go to cg_opendir(), anything else is
 * -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
2084
2085 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2086 struct fuse_file_info *fi)
2087 {
2088 if (strcmp(path, "/") == 0) {
2089 if (filler(buf, "proc", NULL, 0) != 0 ||
2090 filler(buf, "cgroup", NULL, 0) != 0)
2091 return -EINVAL;
2092 return 0;
2093 }
2094 if (strncmp(path, "/cgroup", 7) == 0)
2095 return cg_readdir(path, buf, filler, offset, fi);
2096 if (strcmp(path, "/proc") == 0)
2097 return proc_readdir(path, buf, filler, offset, fi);
2098 return -EINVAL;
2099 }
2100
/*
 * releasedir for the whole filesystem.  Nothing to do for "/" and
 * "/proc"; paths starting with /cgroup go to cg_releasedir(); anything
 * else is -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
2112
/*
 * open for the whole filesystem: dispatch on the /cgroup or /proc path
 * prefix, -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_open(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_open(path, fi);

	return -EINVAL;
}
2122
/*
 * read for the whole filesystem: dispatch on the /cgroup or /proc path
 * prefix, -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_read(path, buf, size, offset, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2133
/*
 * write for the whole filesystem: only paths starting with /cgroup are
 * handled (by cg_write()); everything else is -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2143
/* flush: nothing to do, always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2148
/* release: nothing to do, always reports success. */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2153
/* fsync: nothing to do, always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2158
/*
 * mkdir for the whole filesystem: only paths starting with /cgroup are
 * handled (by cg_mkdir()); everything else is -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2166
/*
 * chown for the whole filesystem: only paths starting with /cgroup are
 * handled (by cg_chown()); everything else is -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2174
2175 /*
2176 * cat first does a truncate before doing ops->write. This doesn't
2177 * really make sense for cgroups. So just return 0 always but do
2178 * nothing.
2179 */
/*
 * truncate: accepted as a no-op (0) for paths starting with /cgroup,
 * -EINVAL for everything else.  @newsize is not used.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2186
/*
 * rmdir for the whole filesystem: only paths starting with /cgroup are
 * handled (by cg_rmdir()); everything else is -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2193
/*
 * chmod for the whole filesystem: only paths starting with /cgroup are
 * handled (by cg_chmod()); everything else is -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2200
2201 const struct fuse_operations lxcfs_ops = {
2202 .getattr = lxcfs_getattr,
2203 .readlink = NULL,
2204 .getdir = NULL,
2205 .mknod = NULL,
2206 .mkdir = lxcfs_mkdir,
2207 .unlink = NULL,
2208 .rmdir = lxcfs_rmdir,
2209 .symlink = NULL,
2210 .rename = NULL,
2211 .link = NULL,
2212 .chmod = lxcfs_chmod,
2213 .chown = lxcfs_chown,
2214 .truncate = lxcfs_truncate,
2215 .utime = NULL,
2216
2217 .open = lxcfs_open,
2218 .read = lxcfs_read,
2219 .release = lxcfs_release,
2220 .write = lxcfs_write,
2221
2222 .statfs = NULL,
2223 .flush = lxcfs_flush,
2224 .fsync = lxcfs_fsync,
2225
2226 .setxattr = NULL,
2227 .getxattr = NULL,
2228 .listxattr = NULL,
2229 .removexattr = NULL,
2230
2231 .opendir = lxcfs_opendir,
2232 .readdir = lxcfs_readdir,
2233 .releasedir = lxcfs_releasedir,
2234
2235 .fsyncdir = NULL,
2236 .init = NULL,
2237 .destroy = NULL,
2238 .access = NULL,
2239 .create = NULL,
2240 .ftruncate = NULL,
2241 .fgetattr = NULL,
2242 };
2243
/* Print the command line synopsis to stderr and exit with status 1. */
static void usage(const char *me)
{
	fprintf(stderr, "Usage:\n\n%s [FUSE and mount options] mountpoint\n", me);
	exit(1);
}
2251
/* Return true when @w is one of the recognised help requests. */
static bool is_help(char *w)
{
	static const char *const words[] = { "-h", "--help", "-help", "help" };
	size_t i;

	for (i = 0; i < sizeof(words) / sizeof(words[0]); i++) {
		if (!strcmp(w, words[i]))
			return true;
	}
	return false;
}
2261
2262 int main(int argc, char *argv[])
2263 {
2264 int ret;
2265 struct lxcfs_state *d;
2266
2267 if (argc < 2 || is_help(argv[1]))
2268 usage(argv[0]);
2269
2270 d = malloc(sizeof(*d));
2271 if (!d)
2272 return -1;
2273
2274 if (!cgm_escape_cgroup())
2275 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2276
2277 if (!cgm_get_controllers(&d->subsystems))
2278 return -1;
2279
2280 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2281
2282 return ret;
2283 }