]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
write: make sure to null terminate the buffer
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13 #define FUSE_USE_VERSION 26
14
15 #include <stdio.h>
16 #include <dirent.h>
17 #include <fcntl.h>
18 #include <fuse.h>
19 #include <unistd.h>
20 #include <errno.h>
21 #include <stdbool.h>
22 #include <time.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <libgen.h>
26 #include <sched.h>
27 #include <linux/sched.h>
28 #include <sys/socket.h>
29 #include <sys/mount.h>
30 #include <wait.h>
31
32 #include <nih/alloc.h>
33 #include <nih/string.h>
34
35 #include "cgmanager.h"
36
/*
 * State attached to the fuse mount; handlers reach it through the
 * private_data pointer of the current fuse context (see LXCFS_DATA).
 */
struct lxcfs_state {
	/* NULL-terminated, nih-allocated array naming the mounted
	 * subsystems; detected at startup. */
	char **subsystems;
};

/* The struct lxcfs_state stored in the current fuse context. */
#define LXCFS_DATA \
	((struct lxcfs_state *) fuse_get_context()->private_data)
45
/*
 * Reap child @pid, retrying when waitpid() is interrupted (EINTR) or
 * reports some other pid.
 *
 * Returns 0 only if the child exited normally with status 0.  Returns -1
 * if waitpid() fails for any other reason, or if the child did not exit
 * normally, or exited with a non-zero status.
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		int reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)
		return 0;
	return -1;
}
67
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open FILE * on a
 * /proc/pid/{u,g}id_map file.
 *
 * The file is rewound and scanned line by line; lines that do not parse
 * as three unsigned numbers are skipped.  The first range that contains
 * @in_id wins.
 *
 * Returns the mapped id, or -1 (as unsigned int) if no range matches or
 * if a range would wrap around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base;	// first id of the range in the idfile's namespace
		unsigned int host_base;	// first id of the range in the caller's namespace
		unsigned int range;	// number of ids in the range

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around.  That is unexpected for a
			 * proc file, so give up rather than guess.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no range contained in_id
	return -1;
}
111
112 /*
113 * for is_privileged_over,
114 * specify whether we require the calling uid to be root in his
115 * namespace
116 */
117 #define NS_ROOT_REQD true
118 #define NS_ROOT_OPT false
119
120 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
121 {
122 nih_local char *fpath = NULL;
123 bool answer = false;
124 uid_t nsuid;
125
126 if (victim == -1 || uid == -1)
127 return false;
128
129 /*
130 * If the request is one not requiring root in the namespace,
131 * then having the same uid suffices. (i.e. uid 1000 has write
132 * access to files owned by uid 1000
133 */
134 if (!req_ns_root && uid == victim)
135 return true;
136
137 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
138 FILE *f = fopen(fpath, "r");
139 if (!f)
140 return false;
141
142 /* if caller's not root in his namespace, reject */
143 nsuid = convert_id_to_ns(f, uid);
144 if (nsuid)
145 goto out;
146
147 /*
148 * If victim is not mapped into caller's ns, reject.
149 * XXX I'm not sure this check is needed given that fuse
150 * will be sending requests where the vfs has converted
151 */
152 nsuid = convert_id_to_ns(f, victim);
153 if (nsuid == -1)
154 goto out;
155
156 answer = true;
157
158 out:
159 fclose(f);
160 return answer;
161 }
162
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * requested by the O_ACCMODE part of @req_mode.  Callers that want to
 * test owner or group bits shift them down into the "other" position
 * first.
 *
 * O_RDONLY needs S_IROTH, O_WRONLY needs S_IWOTH, O_RDWR needs both.
 * Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
182
/*
 * Return the first path component of @taskcg below @querycg, as a fresh
 * nih-allocated string (no parent).
 *
 * When @querycg is "/" one leading character of @taskcg is skipped;
 * otherwise strlen(querycg) + 1 characters are skipped.  The copy is cut
 * at its first '/'.
 *
 * Returns NULL, after printing a diagnostic, when @taskcg is not longer
 * than @querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t qlen = strlen(querycg);
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= qlen) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : qlen + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
201
202 /*
203 * check whether a fuse context may access a cgroup dir or file
204 *
205 * If file is not null, it is a cgroup file to check under cg.
206 * If file is null, then we are checking perms on cg itself.
207 *
208 * For files we can check the mode of the list_keys result.
209 * For cgroups, we must make assumptions based on the files under the
210 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
211 * yet.
212 */
213 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
214 {
215 nih_local struct cgm_keys **list = NULL;
216 int i;
217
218 if (!file)
219 file = "tasks";
220
221 if (*file == '/')
222 file++;
223
224 if (!cgm_list_keys(contrl, cg, &list))
225 return false;
226 for (i = 0; list[i]; i++) {
227 if (strcmp(list[i]->name, file) == 0) {
228 struct cgm_keys *k = list[i];
229 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
230 if (perms_include(k->mode >> 6, mode))
231 return true;
232 }
233 if (fc->gid == k->gid) {
234 if (perms_include(k->mode >> 3, mode))
235 return true;
236 }
237 return perms_include(k->mode, mode);
238 }
239 }
240
241 return false;
242 }
243
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
250
251 /*
252 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
253 * If caller is in /a, he may act on /a/b, but not on /b.
254 * if the answer is false and nextcg is not NULL, then *nextcg will point
255 * to a nih_alloc'd string containing the next cgroup directory under cg
256 */
257 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
258 {
259 nih_local char *fnam = NULL;
260 FILE *f;
261 bool answer = false;
262 char *line = NULL;
263 size_t len = 0;
264
265 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
266 if (!(f = fopen(fnam, "r")))
267 return false;
268
269 while (getline(&line, &len, f) != -1) {
270 char *c1, *c2, *linecmp;
271 if (!line[0])
272 continue;
273 c1 = strchr(line, ':');
274 if (!c1)
275 goto out;
276 c1++;
277 c2 = strchr(c1, ':');
278 if (!c2)
279 goto out;
280 *c2 = '\0';
281 if (strcmp(c1, contrl) != 0)
282 continue;
283 c2++;
284 stripnewline(c2);
285 /*
286 * callers pass in '/' for root cgroup, otherwise they pass
287 * in a cgroup without leading '/'
288 */
289 linecmp = *cg == '/' ? c2 : c2+1;
290 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
291 if (nextcg)
292 *nextcg = get_next_cgroup_dir(linecmp, cg);
293 goto out;
294 }
295 answer = true;
296 goto out;
297 }
298
299 out:
300 fclose(f);
301 free(line);
302 return answer;
303 }
304
305 /*
306 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
307 * and needs to be nih_freed.
308 */
309 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
310 {
311 const char *p1;
312 char *ret, *slash;
313
314 if (strlen(path) < 9)
315 return NULL;
316 p1 = path+8;
317 ret = nih_strdup(NULL, p1);
318 if (!ret)
319 return ret;
320 slash = strstr(ret, "/");
321 if (slash)
322 *slash = '\0';
323
324 /* verify that it is a subsystem */
325 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
326 int i;
327 if (!list) {
328 nih_free(ret);
329 return NULL;
330 }
331 for (i = 0; list[i]; i++) {
332 if (strcmp(list[i], ret) == 0)
333 return ret;
334 }
335 nih_free(ret);
336 return NULL;
337 }
338
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * The returned pointer refers into @path and may include file
 * (keyname) components.
 *
 * Returns NULL if @path is shorter than nine characters or has no '/'
 * after the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
354
355 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
356 {
357 nih_local char **list = NULL;
358 int i;
359
360 if (!f)
361 return false;
362 if (*f == '/')
363 f++;
364
365 if (!cgm_list_children(contr, dir, &list))
366 return false;
367 for (i = 0; list[i]; i++) {
368 if (strcmp(list[i], f) == 0)
369 return true;
370 }
371
372 return false;
373 }
374
375 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
376 {
377 nih_local struct cgm_keys **list = NULL;
378 struct cgm_keys *k;
379 int i;
380
381 if (!f)
382 return NULL;
383 if (*f == '/')
384 f++;
385 if (!cgm_list_keys(contr, dir, &list))
386 return NULL;
387 for (i = 0; list[i]; i++) {
388 if (strcmp(list[i]->name, f) == 0) {
389 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
390 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
391 k->uid = list[i]->uid;
392 k->gid = list[i]->gid;
393 k->mode = list[i]->mode;
394 return k;
395 }
396 }
397
398 return NULL;
399 }
400
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a fresh nih-allocated copy (no parent) of @cg, cut
 * just before the last '/'.  *file is set to point at that last '/'
 * inside @cg itself (so it includes the slash and is not a separate
 * allocation).  If @cg has no '/', *dir is the whole copy and *file is
 * NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy = NIH_MUST( nih_strdup(NULL, cg) );
	char *slash = strrchr(cg, '/');

	*dir = copy;
	*file = slash;
	if (slash)
		copy[slash - cg] = '\0';
}
414
415 static size_t get_file_size(const char *contrl, const char *cg, const char *f)
416 {
417 nih_local char *data = NULL;
418 size_t s;
419 if (!cgm_get_value(contrl, cg, f, &data))
420 return -EINVAL;
421 s = strlen(data);
422 return s;
423 }
424
425 /*
426 * FUSE ops for /cgroup
427 */
428
429 static int cg_getattr(const char *path, struct stat *sb)
430 {
431 struct timespec now;
432 struct fuse_context *fc = fuse_get_context();
433 nih_local char * cgdir = NULL;
434 char *fpath = NULL, *path1, *path2;
435 nih_local struct cgm_keys *k = NULL;
436 const char *cgroup;
437 nih_local char *controller = NULL;
438
439
440 if (!fc)
441 return -EIO;
442
443 memset(sb, 0, sizeof(struct stat));
444
445 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
446 return -EINVAL;
447
448 sb->st_uid = sb->st_gid = 0;
449 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
450 sb->st_size = 0;
451
452 if (strcmp(path, "/cgroup") == 0) {
453 sb->st_mode = S_IFDIR | 00755;
454 sb->st_nlink = 2;
455 return 0;
456 }
457
458 controller = pick_controller_from_path(fc, path);
459 if (!controller)
460 return -EIO;
461 cgroup = find_cgroup_in_path(path);
462 if (!cgroup) {
463 /* this is just /cgroup/controller, return it as a dir */
464 sb->st_mode = S_IFDIR | 00755;
465 sb->st_nlink = 2;
466 return 0;
467 }
468
469 get_cgdir_and_path(cgroup, &cgdir, &fpath);
470
471 if (!fpath) {
472 path1 = "/";
473 path2 = cgdir;
474 } else {
475 path1 = cgdir;
476 path2 = fpath;
477 }
478
479 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
480 * Then check that caller's cgroup is under path if fpath is a child
481 * cgroup, or cgdir if fpath is a file */
482
483 if (is_child_cgroup(controller, path1, path2)) {
484 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
485 /* this is just /cgroup/controller, return it as a dir */
486 sb->st_mode = S_IFDIR | 00555;
487 sb->st_nlink = 2;
488 return 0;
489 }
490 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
491 return -EACCES;
492
493 // get uid, gid, from '/tasks' file and make up a mode
494 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
495 sb->st_mode = S_IFDIR | 00755;
496 k = get_cgroup_key(controller, cgroup, "tasks");
497 if (!k) {
498 sb->st_uid = sb->st_gid = 0;
499 } else {
500 sb->st_uid = k->uid;
501 sb->st_gid = k->gid;
502 }
503 sb->st_nlink = 2;
504 return 0;
505 }
506
507 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
508 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
509 return -ENOENT;
510 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
511 return -EACCES;
512
513 sb->st_mode = S_IFREG | k->mode;
514 sb->st_nlink = 1;
515 sb->st_uid = k->uid;
516 sb->st_gid = k->gid;
517 sb->st_size = get_file_size(controller, path1, path2);
518 return 0;
519 }
520
521 return -ENOENT;
522 }
523
524 /*
525 * TODO - cache these results in a table for use in opendir, free
526 * in releasedir
527 */
528 static int cg_opendir(const char *path, struct fuse_file_info *fi)
529 {
530 struct fuse_context *fc = fuse_get_context();
531 nih_local struct cgm_keys **list = NULL;
532 const char *cgroup;
533 nih_local char *controller = NULL;
534 nih_local char *nextcg = NULL;
535
536 if (!fc)
537 return -EIO;
538
539 if (strcmp(path, "/cgroup") == 0)
540 return 0;
541
542 // return list of keys for the controller, and list of child cgroups
543 controller = pick_controller_from_path(fc, path);
544 if (!controller)
545 return -EIO;
546
547 cgroup = find_cgroup_in_path(path);
548 if (!cgroup) {
549 /* this is just /cgroup/controller, return its contents */
550 cgroup = "/";
551 }
552
553 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
554 return -EACCES;
555 return 0;
556 }
557
558 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
559 struct fuse_file_info *fi)
560 {
561 struct fuse_context *fc = fuse_get_context();
562
563 if (!fc)
564 return -EIO;
565
566 if (strcmp(path, "/cgroup") == 0) {
567 // get list of controllers
568 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
569 int i;
570
571 if (!list)
572 return -EIO;
573
574 for (i = 0; list[i]; i++) {
575 if (filler(buf, list[i], NULL, 0) != 0) {
576 return -EIO;
577 }
578 }
579 return 0;
580 }
581
582 // return list of keys for the controller, and list of child cgroups
583 nih_local struct cgm_keys **list = NULL;
584 const char *cgroup;
585 nih_local char *controller = NULL;
586 int i;
587 nih_local char *nextcg = NULL;
588
589 controller = pick_controller_from_path(fc, path);
590 if (!controller)
591 return -EIO;
592
593 cgroup = find_cgroup_in_path(path);
594 if (!cgroup) {
595 /* this is just /cgroup/controller, return its contents */
596 cgroup = "/";
597 }
598
599 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
600 return -EACCES;
601
602 if (!cgm_list_keys(controller, cgroup, &list))
603 // not a valid cgroup
604 return -EINVAL;
605
606 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &nextcg)) {
607 if (nextcg) {
608 int ret;
609 ret = filler(buf, nextcg, NULL, 0);
610 if (ret != 0)
611 return -EIO;
612 }
613 return 0;
614 }
615
616 for (i = 0; list[i]; i++) {
617 if (filler(buf, list[i]->name, NULL, 0) != 0) {
618 return -EIO;
619 }
620 }
621
622 // now get the list of child cgroups
623 nih_local char **clist;
624
625 if (!cgm_list_children(controller, cgroup, &clist))
626 return 0;
627 for (i = 0; clist[i]; i++) {
628 if (filler(buf, clist[i], NULL, 0) != 0) {
629 return -EIO;
630 }
631 }
632 return 0;
633 }
634
/* FUSE releasedir: nothing to release, always succeeds. */
static int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void) path;
	(void) fi;

	return 0;
}
639
640 /*
641 * TODO - cache info here for read/write, release in cg_release.
642 */
643 static int cg_open(const char *path, struct fuse_file_info *fi)
644 {
645 nih_local char *controller = NULL;
646 const char *cgroup;
647 char *fpath = NULL, *path1, *path2;
648 nih_local char * cgdir = NULL;
649 nih_local struct cgm_keys *k = NULL;
650 struct fuse_context *fc = fuse_get_context();
651
652 if (!fc)
653 return -EIO;
654
655 controller = pick_controller_from_path(fc, path);
656 if (!controller)
657 return -EIO;
658 cgroup = find_cgroup_in_path(path);
659 if (!cgroup)
660 return -EINVAL;
661
662 get_cgdir_and_path(cgroup, &cgdir, &fpath);
663 if (!fpath) {
664 path1 = "/";
665 path2 = cgdir;
666 } else {
667 path1 = cgdir;
668 path2 = fpath;
669 }
670
671 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
672 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
673 // should never get here
674 return -EACCES;
675
676 return 0;
677 }
678
679 return -EINVAL;
680 }
681
/*
 * Wait up to two seconds for @sockfd to become readable, then do one
 * non-blocking recv() of at most @len bytes into @buf.
 *
 * Returns -1 if select() fails, otherwise whatever recv() returns.
 * (After a timeout recv() is still attempted, with MSG_DONTWAIT.)
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) < 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
696
697 #define SEND_CREDS_OK 0
698 #define SEND_CREDS_NOTSK 1
699 #define SEND_CREDS_FAIL 2
700 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
701 {
702 struct msghdr msg = { 0 };
703 struct iovec iov;
704 struct cmsghdr *cmsg;
705 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
706 char buf[1];
707 buf[0] = 'p';
708
709 if (pingfirst) {
710 if (msgrecv(sock, buf, 1) != 1) {
711 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
712 __func__);
713 return SEND_CREDS_FAIL;
714 }
715 }
716
717 msg.msg_control = cmsgbuf;
718 msg.msg_controllen = sizeof(cmsgbuf);
719
720 cmsg = CMSG_FIRSTHDR(&msg);
721 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
722 cmsg->cmsg_level = SOL_SOCKET;
723 cmsg->cmsg_type = SCM_CREDENTIALS;
724 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
725
726 msg.msg_name = NULL;
727 msg.msg_namelen = 0;
728
729 buf[0] = v;
730 iov.iov_base = buf;
731 iov.iov_len = sizeof(buf);
732 msg.msg_iov = &iov;
733 msg.msg_iovlen = 1;
734
735 if (sendmsg(sock, &msg, 0) < 0) {
736 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
737 strerror(errno));
738 if (errno == 3)
739 return SEND_CREDS_NOTSK;
740 return SEND_CREDS_FAIL;
741 }
742
743 return SEND_CREDS_OK;
744 }
745
746 static bool recv_creds(int sock, struct ucred *cred, char *v)
747 {
748 struct msghdr msg = { 0 };
749 struct iovec iov;
750 struct cmsghdr *cmsg;
751 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
752 char buf[1];
753 int ret;
754 int optval = 1;
755
756 *v = '1';
757
758 cred->pid = -1;
759 cred->uid = -1;
760 cred->gid = -1;
761
762 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
763 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
764 return false;
765 }
766 buf[0] = '1';
767 if (write(sock, buf, 1) != 1) {
768 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
769 return false;
770 }
771
772 msg.msg_name = NULL;
773 msg.msg_namelen = 0;
774 msg.msg_control = cmsgbuf;
775 msg.msg_controllen = sizeof(cmsgbuf);
776
777 iov.iov_base = buf;
778 iov.iov_len = sizeof(buf);
779 msg.msg_iov = &iov;
780 msg.msg_iovlen = 1;
781
782 // retry logic is not ideal, especially as we are not
783 // threaded. Sleep at most 1 second waiting for the client
784 // to send us the scm_cred
785 ret = recvmsg(sock, &msg, 0);
786 if (ret < 0) {
787 fprintf(stderr, "Failed to receive scm_cred: %s\n",
788 strerror(errno));
789 return false;
790 }
791
792 cmsg = CMSG_FIRSTHDR(&msg);
793
794 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
795 cmsg->cmsg_level == SOL_SOCKET &&
796 cmsg->cmsg_type == SCM_CREDENTIALS) {
797 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
798 }
799 *v = buf[0];
800
801 return true;
802 }
803
804
805 /*
806 * pid_to_ns - reads pids from a ucred over a socket, then writes the
807 * int value back over the socket. This shifts the pid from the
808 * sender's pidns into tpid's pidns.
809 */
810 static void pid_to_ns(int sock, pid_t tpid)
811 {
812 char v = '0';
813 struct ucred cred;
814
815 while (recv_creds(sock, &cred, &v)) {
816 if (v == '1')
817 exit(0);
818 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
819 exit(1);
820 }
821 exit(0);
822 }
823
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns into @tpid's pidns,
 * then forks a child which runs pid_to_ns() to actually convert pids.
 *
 * Never returns.  Exits 0 if the child was reaped and had exited
 * cleanly, 1 on any failure (open, setns, fork, or a failed child).
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1;
	char fnam[100];
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();

	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_to_ns(sock, tpid);

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
854
855 /*
856 * To read cgroup files with a particular pid, we will setns into the child
857 * pidns, open a pipe, fork a child - which will be the first to really be in
858 * the child ns - which does the cgm_get_value and writes the data to the pipe.
859 */
860 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
861 {
862 int sock[2] = {-1, -1};
863 nih_local char *tmpdata = NULL;
864 int ret;
865 pid_t qpid, cpid = -1;
866 bool answer = false;
867 char v = '0';
868 struct ucred cred;
869 struct timeval tv;
870 fd_set s;
871
872 if (!cgm_get_value(contrl, cg, file, &tmpdata))
873 return false;
874
875 /*
876 * Now we read the pids from returned data one by one, pass
877 * them into a child in the target namespace, read back the
878 * translated pids, and put them into our to-return data
879 */
880
881 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
882 perror("socketpair");
883 exit(1);
884 }
885
886 cpid = fork();
887 if (cpid == -1)
888 goto out;
889
890 if (!cpid) // child
891 pid_to_ns_wrapper(sock[1], tpid);
892
893 char *ptr = tmpdata;
894 cred.uid = 0;
895 cred.gid = 0;
896 while (sscanf(ptr, "%d\n", &qpid) == 1) {
897 cred.pid = qpid;
898 ret = send_creds(sock[0], &cred, v, true);
899
900 if (ret == SEND_CREDS_NOTSK)
901 goto next;
902 if (ret == SEND_CREDS_FAIL)
903 goto out;
904
905 // read converted results
906 FD_ZERO(&s);
907 FD_SET(sock[0], &s);
908 tv.tv_sec = 1;
909 tv.tv_usec = 0;
910 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
911 if (ret <= 0) {
912 kill(cpid, SIGTERM);
913 goto out;
914 }
915 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
916 kill(cpid, SIGTERM);
917 perror("read");
918 goto out;
919 }
920 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
921 next:
922 ptr = strchr(ptr, '\n');
923 if (!ptr)
924 break;
925 ptr++;
926 }
927
928 cred.pid = getpid();
929 v = '1';
930 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
931 // failed to ask child to exit
932 kill(cpid, SIGTERM);
933 goto out;
934 }
935
936 answer = true;
937
938 out:
939 if (cpid != -1)
940 wait_for_pid(cpid);
941 if (sock[0] != -1) {
942 close(sock[0]);
943 close(sock[1]);
944 }
945 return answer;
946 }
947
948 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
949 struct fuse_file_info *fi)
950 {
951 nih_local char *controller = NULL;
952 const char *cgroup;
953 char *fpath = NULL, *path1, *path2;
954 struct fuse_context *fc = fuse_get_context();
955 nih_local char * cgdir = NULL;
956 nih_local struct cgm_keys *k = NULL;
957
958 if (offset)
959 return -EIO;
960
961 if (!fc)
962 return -EIO;
963
964 controller = pick_controller_from_path(fc, path);
965 if (!controller)
966 return -EINVAL;
967 cgroup = find_cgroup_in_path(path);
968 if (!cgroup)
969 return -EINVAL;
970
971 get_cgdir_and_path(cgroup, &cgdir, &fpath);
972 if (!fpath) {
973 path1 = "/";
974 path2 = cgdir;
975 } else {
976 path1 = cgdir;
977 path2 = fpath;
978 }
979
980 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
981 nih_local char *data = NULL;
982 int s;
983 bool r;
984
985 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
986 // should never get here
987 return -EACCES;
988
989 if (strcmp(path2, "tasks") == 0 ||
990 strcmp(path2, "/tasks") == 0 ||
991 strcmp(path2, "/cgroup.procs") == 0 ||
992 strcmp(path2, "cgroup.procs") == 0)
993 // special case - we have to translate the pids
994 r = do_read_pids(fc->pid, controller, path1, path2, &data);
995 else
996 r = cgm_get_value(controller, path1, path2, &data);
997
998 if (!r)
999 return -EINVAL;
1000
1001 if (!data)
1002 return 0;
1003 s = strlen(data);
1004 if (s > size)
1005 s = size;
1006 memcpy(buf, data, s);
1007
1008 return s;
1009 }
1010
1011 return -EINVAL;
1012 }
1013
1014 static void pid_from_ns(int sock, pid_t tpid)
1015 {
1016 pid_t vpid;
1017 struct ucred cred;
1018 char v;
1019
1020 cred.uid = 0;
1021 cred.gid = 0;
1022 while (read(sock, &vpid, sizeof(pid_t)) == sizeof(pid_t)) {
1023 if (vpid == -1) // done
1024 break;
1025 v = '0';
1026 cred.pid = vpid;
1027 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1028 v = '1';
1029 cred.pid = getpid();
1030 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1031 exit(1);
1032 }
1033 }
1034 exit(0);
1035 }
1036
/*
 * Enter the pid namespace of @tpid and fork a child that runs
 * pid_from_ns() on @sock.  The fork is needed because only children
 * created after setns() actually live in the new pidns.
 *
 * Never returns.  Exits 0 if the child was reaped and had exited
 * cleanly, 1 on any failure (open, setns, fork, or a failed child).
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1;
	char fnam[100];
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();

	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_from_ns(sock, tpid);

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1061
1062 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1063 {
1064 int sock[2] = {-1, -1};
1065 pid_t qpid, cpid = -1;
1066 bool answer = false, fail = false;
1067
1068 /*
1069 * write the pids to a socket, have helper in writer's pidns
1070 * call movepid for us
1071 */
1072 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1073 perror("socketpair");
1074 exit(1);
1075 }
1076
1077 cpid = fork();
1078 if (cpid == -1)
1079 goto out;
1080
1081 if (!cpid) // child
1082 pid_from_ns_wrapper(sock[1], tpid);
1083
1084 const char *ptr = buf;
1085 while (sscanf(ptr, "%d", &qpid) == 1) {
1086 struct ucred cred;
1087 char v;
1088
1089 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1090 kill(cpid, SIGTERM);
1091 perror("write");
1092 goto out;
1093 }
1094
1095 if (recv_creds(sock[0], &cred, &v)) {
1096 if (v == '0') {
1097 if (!cgm_move_pid(contrl, cg, cred.pid))
1098 fail = true;
1099 }
1100 }
1101
1102 ptr = strchr(ptr, '\n');
1103 if (!ptr)
1104 break;
1105 ptr++;
1106 }
1107
1108 /* All good, write the value */
1109 qpid = -1;
1110 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1111 fprintf(stderr, "Warning: failed to ask child to exit\n");
1112
1113 if (!fail)
1114 answer = true;
1115
1116 out:
1117 if (cpid != -1)
1118 wait_for_pid(cpid);
1119 if (sock[0] != -1) {
1120 close(sock[0]);
1121 close(sock[1]);
1122 }
1123 return answer;
1124 }
1125
1126 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1127 struct fuse_file_info *fi)
1128 {
1129 nih_local char *controller = NULL;
1130 const char *cgroup;
1131 char *fpath = NULL, *path1, *path2;
1132 struct fuse_context *fc = fuse_get_context();
1133 nih_local char * cgdir = NULL;
1134 nih_local struct cgm_keys *k = NULL;
1135 nih_local char *localbuf = NULL;
1136
1137 if (offset)
1138 return -EINVAL;
1139
1140 if (!fc)
1141 return -EIO;
1142
1143 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1144 localbuf[size] = '\0';
1145 memcpy(localbuf, buf, size);
1146 controller = pick_controller_from_path(fc, path);
1147 if (!controller)
1148 return -EINVAL;
1149 cgroup = find_cgroup_in_path(path);
1150 if (!cgroup)
1151 return -EINVAL;
1152
1153 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1154 if (!fpath) {
1155 path1 = "/";
1156 path2 = cgdir;
1157 } else {
1158 path1 = cgdir;
1159 path2 = fpath;
1160 }
1161
1162 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
1163 bool r;
1164
1165 if (!fc_may_access(fc, controller, path1, path2, O_WRONLY))
1166 return -EACCES;
1167
1168 if (strcmp(path2, "tasks") == 0 ||
1169 strcmp(path2, "/tasks") == 0 ||
1170 strcmp(path2, "/cgroup.procs") == 0 ||
1171 strcmp(path2, "cgroup.procs") == 0)
1172 // special case - we have to translate the pids
1173 r = do_write_pids(fc->pid, controller, path1, path2, localbuf);
1174 else
1175 r = cgm_set_value(controller, path1, path2, localbuf);
1176
1177 if (!r)
1178 return -EINVAL;
1179
1180 return size;
1181 }
1182
1183 return -EINVAL;
1184 }
1185
1186 int cg_chown(const char *path, uid_t uid, gid_t gid)
1187 {
1188 struct fuse_context *fc = fuse_get_context();
1189 nih_local char * cgdir = NULL;
1190 char *fpath = NULL, *path1, *path2;
1191 nih_local struct cgm_keys *k = NULL;
1192 const char *cgroup;
1193 nih_local char *controller = NULL;
1194
1195
1196 if (!fc)
1197 return -EIO;
1198
1199 if (strcmp(path, "/cgroup") == 0)
1200 return -EINVAL;
1201
1202 controller = pick_controller_from_path(fc, path);
1203 if (!controller)
1204 return -EINVAL;
1205 cgroup = find_cgroup_in_path(path);
1206 if (!cgroup)
1207 /* this is just /cgroup/controller */
1208 return -EINVAL;
1209
1210 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1211
1212 if (!fpath) {
1213 path1 = "/";
1214 path2 = cgdir;
1215 } else {
1216 path1 = cgdir;
1217 path2 = fpath;
1218 }
1219
1220 if (is_child_cgroup(controller, path1, path2)) {
1221 // get uid, gid, from '/tasks' file and make up a mode
1222 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1223 k = get_cgroup_key(controller, cgroup, "tasks");
1224
1225 } else
1226 k = get_cgroup_key(controller, path1, path2);
1227
1228 if (!k)
1229 return -EINVAL;
1230
1231 /*
1232 * This being a fuse request, the uid and gid must be valid
1233 * in the caller's namespace. So we can just check to make
1234 * sure that the caller is root in his uid, and privileged
1235 * over the file's current owner.
1236 */
1237 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1238 return -EACCES;
1239
1240 if (!cgm_chown_file(controller, cgroup, uid, gid))
1241 return -EINVAL;
1242 return 0;
1243 }
1244
1245 int cg_chmod(const char *path, mode_t mode)
1246 {
1247 struct fuse_context *fc = fuse_get_context();
1248 nih_local char * cgdir = NULL;
1249 char *fpath = NULL, *path1, *path2;
1250 nih_local struct cgm_keys *k = NULL;
1251 const char *cgroup;
1252 nih_local char *controller = NULL;
1253
1254 if (!fc)
1255 return -EIO;
1256
1257 if (strcmp(path, "/cgroup") == 0)
1258 return -EINVAL;
1259
1260 controller = pick_controller_from_path(fc, path);
1261 if (!controller)
1262 return -EINVAL;
1263 cgroup = find_cgroup_in_path(path);
1264 if (!cgroup)
1265 /* this is just /cgroup/controller */
1266 return -EINVAL;
1267
1268 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1269
1270 if (!fpath) {
1271 path1 = "/";
1272 path2 = cgdir;
1273 } else {
1274 path1 = cgdir;
1275 path2 = fpath;
1276 }
1277
1278 if (is_child_cgroup(controller, path1, path2)) {
1279 // get uid, gid, from '/tasks' file and make up a mode
1280 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1281 k = get_cgroup_key(controller, cgroup, "tasks");
1282
1283 } else
1284 k = get_cgroup_key(controller, path1, path2);
1285
1286 if (!k)
1287 return -EINVAL;
1288
1289 /*
1290 * This being a fuse request, the uid and gid must be valid
1291 * in the caller's namespace. So we can just check to make
1292 * sure that the caller is root in his uid, and privileged
1293 * over the file's current owner.
1294 */
1295 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1296 return -EPERM;
1297
1298 if (!cgm_chmod_file(controller, cgroup, mode))
1299 return -EINVAL;
1300 return 0;
1301 }
1302
1303 int cg_mkdir(const char *path, mode_t mode)
1304 {
1305 struct fuse_context *fc = fuse_get_context();
1306 nih_local struct cgm_keys **list = NULL;
1307 char *fpath = NULL, *path1;
1308 nih_local char * cgdir = NULL;
1309 const char *cgroup;
1310 nih_local char *controller = NULL;
1311
1312 if (!fc)
1313 return -EIO;
1314
1315
1316 controller = pick_controller_from_path(fc, path);
1317 if (!controller)
1318 return -EINVAL;
1319
1320 cgroup = find_cgroup_in_path(path);
1321 if (!cgroup)
1322 return -EINVAL;
1323
1324 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1325 if (!fpath)
1326 path1 = "/";
1327 else
1328 path1 = cgdir;
1329
1330 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1331 return -EACCES;
1332
1333
1334 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1335 return -EINVAL;
1336
1337 return 0;
1338 }
1339
1340 static int cg_rmdir(const char *path)
1341 {
1342 struct fuse_context *fc = fuse_get_context();
1343 nih_local struct cgm_keys **list = NULL;
1344 char *fpath = NULL;
1345 nih_local char * cgdir = NULL;
1346 const char *cgroup;
1347 nih_local char *controller = NULL;
1348
1349 if (!fc)
1350 return -EIO;
1351
1352
1353 controller = pick_controller_from_path(fc, path);
1354 if (!controller)
1355 return -EINVAL;
1356
1357 cgroup = find_cgroup_in_path(path);
1358 if (!cgroup)
1359 return -EINVAL;
1360
1361 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1362 if (!fpath)
1363 return -EINVAL;
1364
1365 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1366 return -EACCES;
1367
1368 if (!cgm_remove(controller, cgroup))
1369 return -EINVAL;
1370
1371 return 0;
1372 }
1373
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1380
/*
 * Scan the newline separated text @memstat for the first line starting
 * with "total_cache" and store the number following that key, divided by
 * 1024, in *@v.  *@v is set to 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1398
1399 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1400 {
1401 nih_local char *fnam = NULL;
1402 FILE *f;
1403 char *answer = NULL;
1404 char *line = NULL;
1405 size_t len = 0;
1406
1407 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1408 if (!(f = fopen(fnam, "r")))
1409 return false;
1410
1411 while (getline(&line, &len, f) != -1) {
1412 char *c1, *c2;
1413 if (!line[0])
1414 continue;
1415 c1 = strchr(line, ':');
1416 if (!c1)
1417 goto out;
1418 c1++;
1419 c2 = strchr(c1, ':');
1420 if (!c2)
1421 goto out;
1422 *c2 = '\0';
1423 if (strcmp(c1, contrl) != 0)
1424 continue;
1425 c2++;
1426 stripnewline(c2);
1427 answer = NIH_MUST( nih_strdup(NULL, c2) );
1428 goto out;
1429 }
1430
1431 out:
1432 fclose(f);
1433 free(line);
1434 return answer;
1435 }
1436
1437 /*
1438 * FUSE ops for /proc
1439 */
1440
1441 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1442 struct fuse_file_info *fi)
1443 {
1444 struct fuse_context *fc = fuse_get_context();
1445 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1446 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1447 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1448 char *line = NULL;
1449 size_t linelen = 0, total_len = 0;
1450 FILE *f;
1451
1452 if (offset)
1453 return -EINVAL;
1454
1455 if (!cg)
1456 return 0;
1457
1458 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1459 return 0;
1460 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1461 return 0;
1462 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1463 return 0;
1464 memlimit = strtoul(memlimit_str, NULL, 10);
1465 memusage = strtoul(memusage_str, NULL, 10);
1466 memlimit /= 1024;
1467 memusage /= 1024;
1468 get_mem_cached(memstat_str, &cached);
1469
1470 f = fopen("/proc/meminfo", "r");
1471 if (!f)
1472 return 0;
1473
1474 while (getline(&line, &linelen, f) != -1) {
1475 size_t l;
1476 char *printme, lbuf[100];
1477
1478 memset(lbuf, 0, 100);
1479 if (startswith(line, "MemTotal:")) {
1480 sscanf(line+14, "%lu", &hosttotal);
1481 if (hosttotal < memlimit)
1482 memlimit = hosttotal;
1483 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1484 printme = lbuf;
1485 } else if (startswith(line, "MemFree:")) {
1486 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1487 printme = lbuf;
1488 } else if (startswith(line, "MemAvailable:")) {
1489 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1490 printme = lbuf;
1491 } else if (startswith(line, "Buffers:")) {
1492 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1493 printme = lbuf;
1494 } else if (startswith(line, "Cached:")) {
1495 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1496 printme = lbuf;
1497 } else if (startswith(line, "SwapCached:")) {
1498 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1499 printme = lbuf;
1500 } else
1501 printme = line;
1502 l = snprintf(buf, size, "%s", printme);
1503 buf += l;
1504 size -= l;
1505 total_len += l;
1506 }
1507
1508 return total_len;
1509 }
1510
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg in the cpuset controller.
 * Returns the string handed back by cgm_get_value(), or NULL when the
 * lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1523
/*
 * Helper functions for cpu_in_cpuset()
 */

/*
 * Return a pointer to the token following the one starting at @c, i.e.
 * the character after the next ',', or NULL when @c is the last token
 * (or the empty string).
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* nothing to skip in an empty string; c+1 would be out of bounds */
	if (!*c)
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}

/*
 * Parse the token at @c as "a" or "a-b".  Returns the sscanf() result:
 * 2 when both *@a and *@b were set, 1 when only *@a was set, and a value
 * below 1 when the token does not start with a number.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	int ret;

	ret = sscanf(c, "%d-%d", a, b);
	return ret;
}

/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited ranges
 *
 * Return true when @cpu is named by one of the tokens of @cpuset, either
 * as a single cpu or as a member of an inclusive range.  A token that
 * does not start with a number makes the whole cpuset invalid and yields
 * false.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret < 1) // bad cpuset!
			return false;
		if (ret == 1 && cpu == a) // single cpu, e.g. "1"
			return true;
		if (ret == 2 && cpu >= a && cpu <= b) // range, e.g. "2-3"
			return true;
		/* no match in this token: keep looking at the next one */
	}

	return false;
}
1565
/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * that cpu is part of @cpuset.  Lines that do not parse yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int id;
	bool parsed = sscanf(line, "processor : %d", &id) == 1;

	return parsed && cpu_in_cpuset(id, cpuset);
}
1574
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1586
1587 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1588 struct fuse_file_info *fi)
1589 {
1590 struct fuse_context *fc = fuse_get_context();
1591 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1592 nih_local char *cpuset = NULL;
1593 char *line = NULL;
1594 size_t linelen = 0, total_len = 0;
1595 bool am_printing = false;
1596 int curcpu = -1;
1597 FILE *f;
1598
1599 if (offset)
1600 return -EINVAL;
1601
1602 if (!cg)
1603 return 0;
1604
1605 cpuset = get_cpuset(cg);
1606 if (!cpuset)
1607 return 0;
1608
1609 f = fopen("/proc/cpuinfo", "r");
1610 if (!f)
1611 return 0;
1612
1613 while (getline(&line, &linelen, f) != -1) {
1614 size_t l;
1615 if (is_processor_line(line)) {
1616 am_printing = cpuline_in_cpuset(line, cpuset);
1617 if (am_printing) {
1618 curcpu ++;
1619 l = snprintf(buf, size, "processor : %d\n", curcpu);
1620 buf += l;
1621 size -= l;
1622 total_len += l;
1623 }
1624 continue;
1625 }
1626 if (am_printing) {
1627 l = snprintf(buf, size, "%s", line);
1628 buf += l;
1629 size -= l;
1630 total_len += l;
1631 }
1632 }
1633
1634 return total_len;
1635 }
1636
1637 static int proc_stat_read(char *buf, size_t size, off_t offset,
1638 struct fuse_file_info *fi)
1639 {
1640 struct fuse_context *fc = fuse_get_context();
1641 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1642 nih_local char *cpuset = NULL;
1643 char *line = NULL;
1644 size_t linelen = 0, total_len = 0;
1645 int curcpu = 0;
1646 FILE *f;
1647
1648 if (offset)
1649 return -EINVAL;
1650
1651 if (!cg)
1652 return 0;
1653
1654 cpuset = get_cpuset(cg);
1655 if (!cpuset)
1656 return 0;
1657
1658 f = fopen("/proc/stat", "r");
1659 if (!f)
1660 return 0;
1661
1662 while (getline(&line, &linelen, f) != -1) {
1663 size_t l;
1664 int cpu;
1665 char *c;
1666
1667 if (sscanf(line, "cpu%d", &cpu) != 1) {
1668 /* not a ^cpu line, just print it */
1669 l = snprintf(buf, size, "%s", line);
1670 buf += l;
1671 size -= l;
1672 total_len += l;
1673 continue;
1674 }
1675 if (!cpu_in_cpuset(cpu, cpuset))
1676 continue;
1677 curcpu ++;
1678
1679 c = strchr(line, ' ');
1680 if (!c)
1681 continue;
1682 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1683 buf += l;
1684 size -= l;
1685 total_len += l;
1686 }
1687
1688 return total_len;
1689 }
1690
1691 /*
1692 * How to guess what to present for uptime?
1693 * One thing we could do would be to take the date on the caller's
1694 * memory.usage_in_bytes file, which should equal the time of creation
1695 * of his cgroup. However, a task could be in a sub-cgroup of the
1696 * container. The same problem exists if we try to look at the ages
1697 * of processes in the caller's cgroup.
1698 *
1699 * So we'll fork a task that will enter the caller's pidns, mount a
1700 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1701 *
1702 * For the second uptime #, we'll do as Stéphane had done, just copy
1703 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1704 * time. Maybe someone can come up with a good algorithm and submit a
1705 * patch. Maybe something based on cpushare info?
1706 */
1707
/*
 * Return the age, in seconds, of the reaper (pid 1) of @pid's pid
 * namespace, taken from the ctime of its /proc directory.
 *
 * The function unshares its mount namespace, joins the pid namespace of
 * @pid and forks.  Only the forked child returns to the caller (with the
 * age, or 0 on error); the process that called fork() waits for that
 * child and then exits.  Early failures return 0 in the original process.
 */
static long int get_pid1_time(pid_t pid)
{
	char nspath[100];
	struct stat st;
	pid_t child;
	int nsfd;

	if (unshare(CLONE_NEWNS) != 0)
		return 0;

	sprintf(nspath, "/proc/%d/ns/pid", pid);
	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(nsfd, 0) != 0) {
		perror("get_pid1_time setns 1");
		close(nsfd);
		return 0;
	}
	close(nsfd);

	child = fork();
	if (child < 0)
		return 0;

	if (child > 0) {
		// child will do the writing for us
		wait_for_pid(child);
		exit(0);
	}

	/* from here on we are the child: mount a fresh procfs */
	umount2("/proc", MNT_DETACH);

	if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
		perror("get_pid1_time mount");
		return 0;
	}
	if (lstat("/proc/1", &st) != 0) {
		perror("get_pid1_time lstat");
		return 0;
	}
	return time(NULL) - st.st_ctime;
}
1755
/*
 * Return the age in seconds of the reaper of @qpid's pid namespace.
 *
 * A child is forked to run get_pid1_time() and write the result to a
 * pipe; the parent waits up to one second for the answer.  Returns 0
 * when the pipe or the child cannot be created, on timeout, or when the
 * answer cannot be read.
 */
static long int getreaperage(pid_t qpid)
{
	int mypipe[2], ret;
	pid_t pid;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child to wait for: just release the pipe */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		close(mypipe[0]);
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret == -1) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1801
/*
 * Return the second field (idle time) of the host's /proc/uptime,
 * truncated to whole seconds, or 0 when the file cannot be read or
 * parsed.
 *
 * Both fields are decimal numbers such as "350735.47", so they are
 * parsed as floating point; an integer conversion stops at the '.' and
 * never reaches the second field.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int matched;

	if (!f)
		return 0;
	matched = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (matched != 2)
		return 0;
	return (long int)idle;
}
1812
1813 /*
1814 * We read /proc/uptime and reuse its second field.
1815 * For the first field, we use the mtime for the reaper for
1816 * the calling pid as returned by getreaperage
1817 */
1818 static int proc_uptime_read(char *buf, size_t size, off_t offset,
1819 struct fuse_file_info *fi)
1820 {
1821 struct fuse_context *fc = fuse_get_context();
1822 long int reaperage = getreaperage(fc->pid);;
1823 long int idletime = getprocidle();
1824
1825 if (offset)
1826 return -EINVAL;
1827 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
1828 }
1829
/*
 * Return the number of bytes in the file @which, determined by reading
 * it line by line (stat() is not used; the size is whatever getline()
 * actually delivers).  Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t len = 0;
	ssize_t sz, answer = 0;
	if (!f)
		return 0;

	while ((sz = getline(&line, &len, f)) != -1)
		answer += sz;
	fclose (f);
	/* getline() allocated the buffer; it is ours to release */
	free(line);

	return answer;
}
1845
/*
 * FUSE getattr handler for /proc and the files below it.
 *
 * @sb is zeroed, owned by uid/gid 0 and stamped with the current time.
 * "/proc" is reported as a 0555 directory; meminfo, cpuinfo, uptime and
 * stat are reported as 0444 regular files whose size comes from
 * get_procfile_size() on the same path.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and
 * -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_ctim = now;
	sb->st_mtim = now;
	sb->st_atim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) != 0)
			continue;
		sb->st_size = get_procfile_size(path);
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
1873
1874 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1875 struct fuse_file_info *fi)
1876 {
1877 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
1878 filler(buf, "meminfo", NULL, 0) != 0 ||
1879 filler(buf, "stat", NULL, 0) != 0 ||
1880 filler(buf, "uptime", NULL, 0) != 0)
1881 return -EINVAL;
1882 return 0;
1883 }
1884
/*
 * FUSE open handler for /proc: succeed (0) for the four emulated files,
 * fail with -ENOENT for anything else.  @fi is not used.
 */
static int proc_open(const char *path, struct fuse_file_info *fi)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	size_t i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) == 0)
			return 0;
	}
	return -ENOENT;
}
1894
1895 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
1896 struct fuse_file_info *fi)
1897 {
1898 if (strcmp(path, "/proc/meminfo") == 0)
1899 return proc_meminfo_read(buf, size, offset, fi);
1900 if (strcmp(path, "/proc/cpuinfo") == 0)
1901 return proc_cpuinfo_read(buf, size, offset, fi);
1902 if (strcmp(path, "/proc/uptime") == 0)
1903 return proc_uptime_read(buf, size, offset, fi);
1904 if (strcmp(path, "/proc/stat") == 0)
1905 return proc_stat_read(buf, size, offset, fi);
1906 return -EINVAL;
1907 }
1908
1909 /*
1910 * FUSE ops for /
1911 * these just delegate to the /proc and /cgroup ops as
1912 * needed
1913 */
1914
1915 static int lxcfs_getattr(const char *path, struct stat *sb)
1916 {
1917 if (strcmp(path, "/") == 0) {
1918 sb->st_mode = S_IFDIR | 00755;
1919 sb->st_nlink = 2;
1920 return 0;
1921 }
1922 if (strncmp(path, "/cgroup", 7) == 0) {
1923 return cg_getattr(path, sb);
1924 }
1925 if (strncmp(path, "/proc", 5) == 0) {
1926 return proc_getattr(path, sb);
1927 }
1928 return -EINVAL;
1929 }
1930
/*
 * FUSE opendir handler.  "/" and "/proc" need no per-open state and
 * succeed immediately; paths starting with /cgroup are passed on to
 * cg_opendir().  Everything else yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	const bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
1943
1944 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1945 struct fuse_file_info *fi)
1946 {
1947 if (strcmp(path, "/") == 0) {
1948 if (filler(buf, "proc", NULL, 0) != 0 ||
1949 filler(buf, "cgroup", NULL, 0) != 0)
1950 return -EINVAL;
1951 return 0;
1952 }
1953 if (strncmp(path, "/cgroup", 7) == 0)
1954 return cg_readdir(path, buf, filler, offset, fi);
1955 if (strcmp(path, "/proc") == 0)
1956 return proc_readdir(path, buf, filler, offset, fi);
1957 return -EINVAL;
1958 }
1959
/*
 * FUSE releasedir handler.  Nothing is held for "/" and "/proc", so they
 * return 0; paths starting with /cgroup are passed on to
 * cg_releasedir().  Everything else yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	const bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
1971
/*
 * FUSE open handler: delegate to cg_open() or proc_open() based on the
 * path prefix; -EINVAL when neither prefix matches.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_open(path, fi);

	return strncmp(path, "/proc", 5) == 0 ? proc_open(path, fi) : -EINVAL;
}
1981
/*
 * FUSE read handler: delegate to cg_read() or proc_read() based on the
 * path prefix; -EINVAL when neither prefix matches.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);

	return strncmp(path, "/proc", 5) == 0
		? proc_read(path, buf, size, offset, fi)
		: -EINVAL;
}
1992
/*
 * FUSE write handler: only paths starting with /cgroup are writable and
 * are handed to cg_write(); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2002
/* FUSE flush handler: there is nothing to flush, always returns 0. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2007
/* FUSE release handler: no per-open state is kept, always returns 0. */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2012
/* FUSE fsync handler: there is nothing to sync, always returns 0. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2017
/*
 * FUSE mkdir handler: directories can only be created below /cgroup, via
 * cg_mkdir(); everything else yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2025
/*
 * FUSE chown handler: ownership can only be changed below /cgroup, via
 * cg_chown(); everything else yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2033
/*
 * FUSE truncate handler.
 *
 * cat first does a truncate before doing ops->write.  That does not
 * really make sense for cgroup files, so for paths starting with /cgroup
 * we report success without doing anything.  All other paths yield
 * -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2045
/*
 * FUSE rmdir handler: directories can only be removed below /cgroup, via
 * cg_rmdir(); everything else yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2052
/*
 * FUSE chmod handler: modes can only be changed below /cgroup, via
 * cg_chmod(); everything else yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2059
2060 const struct fuse_operations lxcfs_ops = {
2061 .getattr = lxcfs_getattr,
2062 .readlink = NULL,
2063 .getdir = NULL,
2064 .mknod = NULL,
2065 .mkdir = lxcfs_mkdir,
2066 .unlink = NULL,
2067 .rmdir = lxcfs_rmdir,
2068 .symlink = NULL,
2069 .rename = NULL,
2070 .link = NULL,
2071 .chmod = lxcfs_chmod,
2072 .chown = lxcfs_chown,
2073 .truncate = lxcfs_truncate,
2074 .utime = NULL,
2075
2076 .open = lxcfs_open,
2077 .read = lxcfs_read,
2078 .release = lxcfs_release,
2079 .write = lxcfs_write,
2080
2081 .statfs = NULL,
2082 .flush = lxcfs_flush,
2083 .fsync = lxcfs_fsync,
2084
2085 .setxattr = NULL,
2086 .getxattr = NULL,
2087 .listxattr = NULL,
2088 .removexattr = NULL,
2089
2090 .opendir = lxcfs_opendir,
2091 .readdir = lxcfs_readdir,
2092 .releasedir = lxcfs_releasedir,
2093
2094 .fsyncdir = NULL,
2095 .init = NULL,
2096 .destroy = NULL,
2097 .access = NULL,
2098 .create = NULL,
2099 .ftruncate = NULL,
2100 .fgetattr = NULL,
2101 };
2102
/*
 * Print the command line synopsis for program name @me to stderr and
 * terminate the process with exit status 1.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);
	exit(1);
}
2110
/* Return true when @w is one of the recognised spellings of "help". */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h", "--help", "-help", "help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}
	return false;
}
2120
2121 int main(int argc, char *argv[])
2122 {
2123 int ret;
2124 struct lxcfs_state *d;
2125
2126 if (argc < 2 || is_help(argv[1]))
2127 usage(argv[0]);
2128
2129 d = malloc(sizeof(*d));
2130 if (!d)
2131 return -1;
2132
2133 if (!cgm_escape_cgroup())
2134 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2135
2136 if (!cgm_get_controllers(&d->subsystems))
2137 return -1;
2138
2139 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2140
2141 return ret;
2142 }