]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
bindings: use openat fd for fstatat(), unlinkat()
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
237e200e 11#include <dirent.h>
29a73c2f 12#include <errno.h>
237e200e
SH
13#include <fcntl.h>
14#include <fuse.h>
237e200e 15#include <libgen.h>
237e200e 16#include <pthread.h>
29a73c2f
CB
17#include <sched.h>
18#include <stdbool.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <time.h>
23#include <unistd.h>
24#include <wait.h>
237e200e 25#include <linux/sched.h>
29a73c2f
CB
26#include <sys/epoll.h>
27#include <sys/mman.h>
28#include <sys/mount.h>
237e200e
SH
29#include <sys/param.h>
30#include <sys/socket.h>
29a73c2f 31#include <sys/syscall.h>
237e200e 32
237e200e 33#include "bindings.h"
237e200e
SH
34#include "config.h" // for VERSION
35
29a73c2f
CB
/*
 * pivot_root(): use the C library's wrapper when the build detected one
 * (HAVE_PIVOT_ROOT); otherwise provide a local one that issues the raw
 * syscall, or fails with ENOSYS when the syscall number is not known.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
50
7dd6560a
CB
/*
 * lxcfs_debug(format, ...): printf-style trace to stderr, prefixed with the
 * file, line and function of the call site. Expands to nothing unless DEBUG
 * is defined. At least one argument after the format is required.
 */
#ifndef DEBUG
#define lxcfs_debug(format, ...)
#else
#define lxcfs_debug(format, ...)                                           \
	do {                                                               \
		fprintf(stderr, "%s: %d: %s: " format, __FILE__, __LINE__, \
			__func__, __VA_ARGS__);                            \
	} while (false)
#endif /* DEBUG */
60
237e200e
SH
/* File type identifiers. The numeric values are spelled out explicitly. */
enum {
	LXC_TYPE_CGDIR          = 0,
	LXC_TYPE_CGFILE         = 1,
	LXC_TYPE_PROC_MEMINFO   = 2,
	LXC_TYPE_PROC_CPUINFO   = 3,
	LXC_TYPE_PROC_UPTIME    = 4,
	LXC_TYPE_PROC_STAT      = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
	LXC_TYPE_PROC_SWAPS     = 7,
};
71
/*
 * Per-file bookkeeping record.
 * NOTE(review): how the fields are populated is not visible in this part of
 * the file; the field notes below restate the original inline comments.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
83/* reserve buffer size, for cpuall in /proc/stat */
84#define BUF_RESERVE_SIZE 256
85
86/*
87 * A table caching which pid is init for a pid namespace.
88 * When looking up which pid is init for $qpid, we first
89 * 1. Stat /proc/$qpid/ns/pid.
90 * 2. Check whether the ino_t is in our store.
91 * a. if not, fork a child in qpid's ns to send us
92 * ucred.pid = 1, and read the initpid. Cache
93 * initpid and creation time for /proc/initpid
94 * in a new store entry.
95 * b. if so, verify that /proc/initpid still matches
96 * what we have saved. If not, clear the store
97 * entry and go back to a. If so, return the
98 * cached initpid.
99 */
/* One cached "pid namespace -> init pid" mapping; entries are chained per hash bucket. */
struct pidns_init_store {
	ino_t ino;			/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry in the same bucket */
	long int lastcheck;		/* time(NULL) when the entry was stored or last validated */
};
107
108/* lol - look at how they are allocated in the kernel */
/* Number of buckets in the pid-namespace init cache. */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for a key (the code uses the ns/pid inode number). */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; each bucket is a singly linked list chained through ->next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];

/* Taken by store_lock() / released by store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l. A locking failure is treated as fatal: the error is reported on
 * stderr and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
123
29a73c2f
CB
/*
 * The three variables below are READ-ONLY after __constructor__
 * collect_and_mount_subsystems() has run.
 */

/* Number of hierarchies mounted. */
static int num_hierarchies;

/* Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
143
237e200e
SH
/*
 * Unlock @l. An unlocking failure is treated as fatal: the error is reported
 * on stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
153
154static void store_lock(void)
155{
156 lock_mutex(&pidns_store_mutex);
157}
158
159static void store_unlock(void)
160{
161 unlock_mutex(&pidns_store_mutex);
162}
163
164/* Must be called under store_lock */
165static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
166{
167 struct stat initsb;
168 char fnam[100];
169
170 snprintf(fnam, 100, "/proc/%d", e->initpid);
171 if (stat(fnam, &initsb) < 0)
172 return false;
7dd6560a
CB
173
174 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
175 initsb.st_ctime, e->initpid);
176
237e200e
SH
177 if (e->ctime != initsb.st_ctime)
178 return false;
179 return true;
180}
181
182/* Must be called under store_lock */
183static void remove_initpid(struct pidns_init_store *e)
184{
185 struct pidns_init_store *tmp;
186 int h;
187
7dd6560a
CB
188 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
189
237e200e
SH
190 h = HASH(e->ino);
191 if (pidns_hash_table[h] == e) {
192 pidns_hash_table[h] = e->next;
193 free(e);
194 return;
195 }
196
197 tmp = pidns_hash_table[h];
198 while (tmp) {
199 if (tmp->next == e) {
200 tmp->next = e->next;
201 free(e);
202 return;
203 }
204 tmp = tmp->next;
205 }
206}
207
208#define PURGE_SECS 5
209/* Must be called under store_lock */
210static void prune_initpid_store(void)
211{
212 static long int last_prune = 0;
213 struct pidns_init_store *e, *prev, *delme;
214 long int now, threshold;
215 int i;
216
217 if (!last_prune) {
218 last_prune = time(NULL);
219 return;
220 }
221 now = time(NULL);
222 if (now < last_prune + PURGE_SECS)
223 return;
7dd6560a
CB
224
225 lxcfs_debug("%s\n", "Pruning.");
226
237e200e
SH
227 last_prune = now;
228 threshold = now - 2 * PURGE_SECS;
229
230 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
231 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
232 if (e->lastcheck < threshold) {
7dd6560a
CB
233
234 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
235
237e200e
SH
236 delme = e;
237 if (prev)
238 prev->next = e->next;
239 else
240 pidns_hash_table[i] = e->next;
241 e = e->next;
242 free(delme);
243 } else {
244 prev = e;
245 e = e->next;
246 }
247 }
248 }
249}
250
251/* Must be called under store_lock */
252static void save_initpid(struct stat *sb, pid_t pid)
253{
254 struct pidns_init_store *e;
255 char fpath[100];
256 struct stat procsb;
257 int h;
258
7dd6560a
CB
259 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
260
237e200e
SH
261 snprintf(fpath, 100, "/proc/%d", pid);
262 if (stat(fpath, &procsb) < 0)
263 return;
264 do {
265 e = malloc(sizeof(*e));
266 } while (!e);
267 e->ino = sb->st_ino;
268 e->initpid = pid;
269 e->ctime = procsb.st_ctime;
270 h = HASH(e->ino);
271 e->next = pidns_hash_table[h];
272 e->lastcheck = time(NULL);
273 pidns_hash_table[h] = e;
274}
275
276/*
277 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
278 * entry for the inode number and creation time. Verify that the init pid
279 * is still valid. If not, remove it. Return the entry if valid, NULL
280 * otherwise.
281 * Must be called under store_lock
282 */
283static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
284{
285 int h = HASH(sb->st_ino);
286 struct pidns_init_store *e = pidns_hash_table[h];
287
288 while (e) {
289 if (e->ino == sb->st_ino) {
290 if (initpid_still_valid(e, sb)) {
291 e->lastcheck = time(NULL);
292 return e;
293 }
294 remove_initpid(e);
295 return NULL;
296 }
297 e = e->next;
298 }
299
300 return NULL;
301}
302
/*
 * Return 1 if @path, resolved relative to directory fd @fd, is a directory
 * (symlinks are followed), 0 otherwise or when it cannot be stat'ed.
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/* The last argument of fstatat() is a flags word, not a descriptor:
	 * passing @fd there made the call fail with invalid flags. */
	int ret = fstatat(fd, path, &statbuf, 0);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
311
/*
 * Return a heap copy of @str, retrying strdup() until it succeeds.
 * A NULL @str yields NULL. The caller owns (and frees) the result.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		;

	return copy;
}
323
/*
 * Strip every trailing '\n' from @s in place.
 * The length is kept in a size_t: storing strlen() in an int truncates for
 * strings longer than INT_MAX.
 */
static inline void drop_trailing_newlines(char *s)
{
	size_t l;

	for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--)
		s[l - 1] = '\0';
}
331
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing in BATCH_SIZE steps.
 * @oldlen is the length the buffer was last sized for; the buffer is only
 * reallocated when it is still NULL or @newlen crosses into a higher batch.
 * realloc() is retried until it succeeds.
 * Batch counts are size_t so large lengths are not narrowed to int.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;
		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}
/*
 * Append @line (@linelen bytes plus its terminating NUL) to the growing
 * buffer *@contents, whose current length is *@len, and update *@len.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* copy the NUL too so the buffer stays a valid C string */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
353
/*
 * Read everything from @fd into a freshly allocated string, with trailing
 * newlines removed. @from is not used. Takes ownership of @fd: it is closed
 * on every path, including when fdopen() fails (it used to leak there).
 * Returns NULL if the stream cannot be set up or no line could be read;
 * otherwise the caller frees the result.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
375
/*
 * Write @string to @fd through a stdio stream. @fnam is not used.
 * Takes ownership of @fd: it is closed on every path, including when
 * fdopen() fails (it used to leak there).
 * Returns true only if all bytes were written and the stream closed cleanly.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* fclose() flushes, so a failure here is still a write error */
	if (fclose(f) < 0) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
396
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory (see cgfs_get_key()). */
struct cgfs_files {
	char *name;	/* heap-allocated; released by free_key() */
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
402
0619767c 403#define ALLOC_NUM 20
237e200e
SH
404static bool store_hierarchy(char *stridx, char *h)
405{
0619767c
SH
406 if (num_hierarchies % ALLOC_NUM == 0) {
407 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
408 n *= ALLOC_NUM;
409 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c
SH
410 if (!tmp) {
411 fprintf(stderr, "Out of memory\n");
412 exit(1);
413 }
237e200e 414 hierarchies = tmp;
237e200e 415 }
f676eb79 416
0619767c 417 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
418 return true;
419}
420
421static void print_subsystems(void)
422{
423 int i;
424
cc97d34c 425 fprintf(stderr, "hierarchies:\n");
237e200e
SH
426 for (i = 0; i < num_hierarchies; i++) {
427 if (hierarchies[i])
428 fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
429 }
430}
431
/*
 * Return true if @needle equals one of the comma-separated elements of
 * @haystack (whole-element match, not substring match).
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	const char *comma;
	size_t nlen = strlen(needle);

	for (;;) {
		comma = *cur ? strchr(cur, ',') : NULL;
		if (comma == NULL)
			break;

		/* element is [cur, comma): compare only when the lengths agree */
		if ((size_t)(comma - cur) == nlen && strncmp(needle, cur, nlen) == 0)
			return true;

		cur = comma + 1;
	}

	/* whatever follows the last comma is the final element */
	return strcmp(needle, cur) == 0;
}
450
451/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
452/* Return the mounted controller and store the corresponding open file descriptor
453 * referring to the controller mountpoint in the private lxcfs namespace in
454 * @cfd.
455 */
456static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
457{
458 int i;
459
460 for (i = 0; i < num_hierarchies; i++) {
461 if (!hierarchies[i])
462 continue;
5dd3e6fd
CB
463 if (strcmp(hierarchies[i], controller) == 0) {
464 *cfd = fd_hierarchies[i];
237e200e 465 return hierarchies[i];
5dd3e6fd
CB
466 }
467 if (in_comma_list(controller, hierarchies[i])) {
468 *cfd = fd_hierarchies[i];
237e200e 469 return hierarchies[i];
5dd3e6fd 470 }
237e200e
SH
471 }
472
473 return NULL;
474}
475
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false if the controller is not mounted, the path cannot be built,
 * the file cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	const char *prefix;
	char *path;
	size_t cap;
	int n, fd, cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	cap = strlen(cgroup) + strlen(file) + 3;
	path = alloca(cap);
	prefix = (*cgroup == '/') ? "." : "";
	n = snprintf(path, cap, "%s%s/%s", prefix, cgroup, file);
	if (n < 0 || (size_t)n >= cap)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() is handed the fd */
	return write_string(path, value, fd);
}
502
/*
 * Chown all the files in the cgroup directory. We do this when we create
 * a cgroup on behalf of a user.
 *
 * @dirname is resolved relative to directory fd @fd. Failures are reported
 * on stderr and otherwise ignored (best effort).
 * The directory descriptor is now closed when fdopendir() fails (it used to
 * leak), and the chown failure message ends with a newline.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
		/* fdopendir() did not take ownership, so release it here */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			fprintf(stderr, "Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	/* closedir() also closes fd1 */
	closedir(d);
}
542
/*
 * Create cgroup @cg under @controller with mode 0755. Unless both @uid and
 * @gid are 0, the new directory and the files inside it are chowned to
 * @uid:@gid.
 * Returns 0 on success, -EINVAL if the controller is not mounted, or -errno
 * from mkdirat()/fchownat().
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t cap;
	char *reldir;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	cap = strlen(cg) + 2;
	reldir = alloca(cap);
	snprintf(reldir, cap, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, reldir, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, reldir, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(reldir, uid, gid, cfd);
	}

	return 0;
}
573
/*
 * Remove directory @dirname and every directory below it.
 *
 * @dirname: path of the directory relative to @cfd
 * @fd:      open descriptor for @dirname; this function takes ownership and
 *           closes it on every path
 * @cfd:     descriptor that all paths are resolved against
 *
 * Only directories are removed; other entries are left to the filesystem.
 * Returns false if the directory could not be read, closed or removed.
 * Failures in subdirectories are only logged and surface as a failed removal
 * of their parent.
 *
 * Each subdirectory is opened with its own descriptor. Previously the
 * recursion reused the parent's @fd, so the child's contents were never read
 * and @fd was closed once per recursion level.
 */
static bool recursive_rmdir(const char *dirname, int fd, int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = true;
	char pathname[MAXPATHLEN];

	dir = fdopendir(fd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		/* fdopendir() did not take ownership, so release it here */
		close(fd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, subfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			fprintf(stderr, "pathname too long\n");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		/* the callee owns and closes subfd */
		subfd = openat(cfd, pathname, O_DIRECTORY);
		if (subfd < 0 || !recursive_rmdir(pathname, subfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
	}

	/* closedir() also closes @fd */
	if (closedir(dir) < 0) {
		fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
630
/*
 * Remove cgroup @cg (and the directories below it) from @controller.
 * Returns false if the controller is not mounted, the cgroup directory
 * cannot be opened, or the removal fails.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int dirfd_cg, cfd;
	size_t cap;
	char *reldir;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	cap = strlen(cg) + 2;
	reldir = alloca(cap);
	snprintf(reldir, cap, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd_cg = openat(cfd, reldir, O_DIRECTORY);
	if (dirfd_cg < 0)
		return false;

	return recursive_rmdir(reldir, dirfd_cg, cfd);
}
654
/*
 * Change the mode of @file under @controller to @mode.
 * Returns false if the controller is not mounted or fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t cap;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	cap = strlen(file) + 2;
	relpath = alloca(cap);
	snprintf(relpath, cap, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
675
/*
 * Chown "<dirname>/tasks" and then "<dirname>/cgroup.procs" (relative to
 * directory fd @fd) to @uid:@gid. Stops at the first failure.
 * Returns 0 on success or -errno from the failing fchownat().
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* sized for the longer of the two names */
	size_t cap = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(cap);

	snprintf(path, cap, "%s/tasks", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	snprintf(path, cap, "%s/cgroup.procs", dirname);
	return fchownat(fd, path, uid, gid, 0) != 0 ? -errno : 0;
}
691
/*
 * Chown @file under @controller to @uid:@gid. When @file is a directory its
 * tasks and cgroup.procs files are chowned as well.
 * Returns 0 on success, -EINVAL if the controller is not mounted, or -errno
 * from a failing fchownat().
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t cap;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	cap = strlen(file) + 2;
	relpath = alloca(cap);
	snprintf(relpath, cap, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, relpath, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(relpath, cfd))
		return 0;

	/* like cgmanager did, we want to chown the tasks file as well */
	return chown_tasks_files(relpath, uid, gid, cfd);
}
717
/*
 * Open the cgroup.procs file of @cgroup under @controller for writing.
 * Returns a stdio stream the caller must fclose(), or NULL if the controller
 * is not mounted or the file cannot be opened. The descriptor is now closed
 * when fdopen() fails (it used to leak).
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd);
	return f;
}
741
f366da65
WB
742static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
743 void ***list, size_t typesize,
744 void* (*iterator)(const char*, const char*, const char*))
237e200e 745{
4ea38a4c 746 int cfd, fd, ret;
237e200e 747 size_t len;
4ea38a4c 748 char *cg, *tmpc;
237e200e 749 char pathname[MAXPATHLEN];
f366da65 750 size_t sz = 0, asz = 0;
4ea38a4c 751 struct dirent *dirent;
237e200e 752 DIR *dir;
237e200e 753
4ea38a4c 754 tmpc = find_mounted_controller(controller, &cfd);
f366da65 755 *list = NULL;
237e200e 756 if (!tmpc)
e97c834b 757 return false;
237e200e 758
f5a6d92e 759 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
760 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
761 cg = alloca(len);
762 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
763 if (ret < 0 || (size_t)ret >= len) {
764 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup);
765 return false;
766 }
237e200e 767
4ea38a4c
CB
768 fd = openat(cfd, cg, O_DIRECTORY);
769 if (fd < 0)
770 return false;
771
772 dir = fdopendir(fd);
237e200e
SH
773 if (!dir)
774 return false;
775
4ea38a4c 776 while ((dirent = readdir(dir))) {
237e200e 777 struct stat mystat;
237e200e 778
4ea38a4c
CB
779 if (!strcmp(dirent->d_name, ".") ||
780 !strcmp(dirent->d_name, ".."))
237e200e
SH
781 continue;
782
4ea38a4c
CB
783 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
784 if (ret < 0 || ret >= MAXPATHLEN) {
785 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg);
237e200e
SH
786 continue;
787 }
788
4ea38a4c 789 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e
SH
790 if (ret) {
791 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
792 continue;
793 }
f366da65
WB
794 if ((!directories && !S_ISREG(mystat.st_mode)) ||
795 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
796 continue;
797
798 if (sz+2 >= asz) {
f366da65 799 void **tmp;
237e200e
SH
800 asz += BATCH_SIZE;
801 do {
f366da65 802 tmp = realloc(*list, asz * typesize);
237e200e
SH
803 } while (!tmp);
804 *list = tmp;
805 }
4ea38a4c 806 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
807 (*list)[sz+1] = NULL;
808 sz++;
809 }
810 if (closedir(dir) < 0) {
4ea38a4c 811 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno));
237e200e
SH
812 return false;
813 }
814 return true;
815}
816
f366da65
WB
/*
 * cgfs_iterate_cgroup() callback: return a heap copy of @dir_entry, retrying
 * strdup() until it succeeds. @controller and @cgroup are not used.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while ((copy = strdup(dir_entry)) == NULL)
		;

	return copy;
}
825
826bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
827{
828 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
829}
830
237e200e
SH
831void free_key(struct cgfs_files *k)
832{
833 if (!k)
834 return;
835 free(k->name);
836 free(k);
837}
838
/*
 * Release a NULL-terminated array of keys: every element up to the first
 * NULL, then the array itself. A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (keys == NULL)
		return;

	for (cur = keys; *cur != NULL; cur++)
		free_key(*cur);

	free(keys);
}
850
/*
 * Read @file inside @cgroup of @controller into a newly allocated string
 * stored in *@value (trailing newlines stripped; the caller frees it).
 * Returns false if the controller is not mounted, the path cannot be built,
 * the file cannot be opened, or nothing could be read. *@value is only
 * assigned once the file has been opened.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	const char *prefix;
	char *path;
	size_t cap;
	int n, fd, cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	cap = strlen(cgroup) + strlen(file) + 3;
	path = alloca(cap);
	prefix = (*cgroup == '/') ? "." : "";
	n = snprintf(path, cap, "%s%s/%s", prefix, cgroup, file);
	if (n < 0 || (size_t)n >= cap)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	/* slurp_file() is handed the fd */
	*value = slurp_file(path, fd);
	return *value != NULL;
}
877
878struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
879{
4ea38a4c 880 int ret, cfd;
237e200e 881 size_t len;
f5a6d92e 882 char *fnam, *tmpc;
237e200e
SH
883 struct stat sb;
884 struct cgfs_files *newkey;
237e200e 885
f5a6d92e 886 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
887 if (!tmpc)
888 return false;
889
890 if (file && *file == '/')
891 file++;
892
06081b29 893 if (file && strchr(file, '/'))
237e200e
SH
894 return NULL;
895
f5a6d92e
CB
896 /* Make sure we pass a relative path to *at() family of functions.
897 * . + /cgroup + / + file + \0
898 */
4ea38a4c 899 len = strlen(cgroup) + 3;
237e200e
SH
900 if (file)
901 len += strlen(file) + 1;
902 fnam = alloca(len);
4ea38a4c
CB
903 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
904 file ? "/" : "", file ? file : "");
237e200e 905
4ea38a4c 906 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
907 if (ret < 0)
908 return NULL;
909
910 do {
911 newkey = malloc(sizeof(struct cgfs_files));
912 } while (!newkey);
913 if (file)
914 newkey->name = must_copy_string(file);
06081b29
CB
915 else if (strrchr(cgroup, '/'))
916 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
917 else
918 newkey->name = must_copy_string(cgroup);
919 newkey->uid = sb.st_uid;
920 newkey->gid = sb.st_gid;
921 newkey->mode = sb.st_mode;
922
923 return newkey;
924}
925
/*
 * cgfs_iterate_cgroup() callback: return the cgfs_files for @dir_entry inside
 * @cgroup, or NULL (after logging to stderr) when it cannot be obtained.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key == NULL)
		fprintf(stderr, "%s: Error getting files under %s:%s\n",
			__func__, controller, cgroup);

	return key;
}
935
936bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
937{
938 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
939}
940
/*
 * Return true if "<cgroup>/<f>" under @controller exists and is a directory.
 * Returns false if the controller is not mounted, the path cannot be built,
 * or the stat fails.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	struct stat sb;
	const char *prefix;
	char *path;
	size_t cap;
	int cfd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	cap = strlen(cgroup) + strlen(f) + 3;
	path = alloca(cap);
	prefix = (*cgroup == '/') ? "." : "";
	n = snprintf(path, cap, "%s%s/%s", prefix, cgroup, f);
	if (n < 0 || (size_t)n >= cap)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
968
/* Status codes compared against the result of send_creds(). */
#define SEND_CREDS_OK    0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL  2

/* Forward declarations for helpers that are used before their definitions. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
237e200e
SH
976
977/*
b10bdd6c 978 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
979 * over a unix sock so we can read the task's reaper's pid in our
980 * namespace
b10bdd6c
FG
981 *
982 * Note: glibc's fork() does not respect pidns, which can lead to failed
983 * assertions inside glibc (and thus failed forks) if the child's pid in
984 * the pidns and the parent pid outside are identical. Using clone prevents
985 * this issue.
237e200e
SH
986 */
987static void write_task_init_pid_exit(int sock, pid_t target)
988{
237e200e
SH
989 char fnam[100];
990 pid_t pid;
237e200e 991 int fd, ret;
b10bdd6c
FG
992 size_t stack_size = sysconf(_SC_PAGESIZE);
993 void *stack = alloca(stack_size);
237e200e
SH
994
995 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
996 if (ret < 0 || ret >= sizeof(fnam))
997 _exit(1);
998
999 fd = open(fnam, O_RDONLY);
1000 if (fd < 0) {
1001 perror("write_task_init_pid_exit open of ns/pid");
1002 _exit(1);
1003 }
1004 if (setns(fd, 0)) {
1005 perror("write_task_init_pid_exit setns 1");
1006 close(fd);
1007 _exit(1);
1008 }
b10bdd6c 1009 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1010 if (pid < 0)
1011 _exit(1);
1012 if (pid != 0) {
1013 if (!wait_for_pid(pid))
1014 _exit(1);
1015 _exit(0);
1016 }
b10bdd6c
FG
1017}
1018
1019static int send_creds_clone_wrapper(void *arg) {
1020 struct ucred cred;
1021 char v;
1022 int sock = *(int *)arg;
237e200e
SH
1023
1024 /* we are the child */
1025 cred.uid = 0;
1026 cred.gid = 0;
1027 cred.pid = 1;
1028 v = '1';
1029 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1030 return 1;
1031 return 0;
237e200e
SH
1032}
1033
/*
 * Find the pid, as seen from the calling process, of the init of @task's pid
 * namespace. A forked child runs write_task_init_pid_exit(); this process
 * reads the pid from the credentials received on the other end of a socket
 * pair, then reaps the child.
 * Returns that pid, or -1 if the socket pair, the fork, or the receive fails.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t child;
	pid_t initpid = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	child = fork();
	if (child == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* only wait for credentials when the fork actually succeeded */
	if (child > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (child > 0)
		wait_for_pid(child);
	return initpid;
}
1067
1068static pid_t lookup_initpid_in_store(pid_t qpid)
1069{
1070 pid_t answer = 0;
1071 struct stat sb;
1072 struct pidns_init_store *e;
1073 char fnam[100];
1074
1075 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1076 store_lock();
1077 if (stat(fnam, &sb) < 0)
1078 goto out;
1079 e = lookup_verify_initpid(&sb);
1080 if (e) {
1081 answer = e->initpid;
1082 goto out;
1083 }
1084 answer = get_init_pid_for_task(qpid);
1085 if (answer > 0)
1086 save_initpid(&sb, answer);
1087
1088out:
1089 /* we prune at end in case we are returning
1090 * the value we were about to return */
1091 prune_initpid_store();
1092 store_unlock();
1093 return answer;
1094}
1095
/*
 * Wait for child @pid to terminate, retrying when interrupted by a signal.
 * Returns 0 only if the child exited normally with status 0; returns -1 for
 * a non-positive @pid, a waitpid() failure, or any other termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1116
1117
1118/*
1119 * append pid to *src.
1120 * src: a pointer to a char* in which ot append the pid.
1121 * sz: the number of characters printed so far, minus trailing \0.
1122 * asz: the allocated size so far
1123 * pid: the pid to append
1124 */
1125static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1126{
1127 char tmp[30];
1128
1129 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1130
1131 if (!*src || tmplen + *sz + 1 >= *asz) {
1132 char *tmp;
1133 do {
1134 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1135 } while (!tmp);
1136 *src = tmp;
1137 *asz += BUF_RESERVE_SIZE;
1138 }
bbfd0e33 1139 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1140 *sz += tmplen;
237e200e
SH
1141}
1142
1143/*
1144 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1145 * valid in the caller's namespace, return the id mapped into
1146 * pid's namespace.
1147 * Returns the mapped id, or -1 on error.
1148 */
1149unsigned int
1150convert_id_to_ns(FILE *idfile, unsigned int in_id)
1151{
1152 unsigned int nsuid, // base id for a range in the idfile's namespace
1153 hostuid, // base id for a range in the caller's namespace
1154 count; // number of ids in this range
1155 char line[400];
1156 int ret;
1157
1158 fseek(idfile, 0L, SEEK_SET);
1159 while (fgets(line, 400, idfile)) {
1160 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1161 if (ret != 3)
1162 continue;
1163 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1164 /*
1165 * uids wrapped around - unexpected as this is a procfile,
1166 * so just bail.
1167 */
1168 fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
1169 nsuid, hostuid, count, line);
1170 return -1;
1171 }
1172 if (hostuid <= in_id && hostuid+count > in_id) {
1173 /*
1174 * now since hostuid <= in_id < hostuid+count, and
1175 * hostuid+count and nsuid+count do not wrap around,
1176 * we know that nsuid+(in_id-hostuid) which must be
1177 * less that nsuid+(count) must not wrap around
1178 */
1179 return (in_id - hostuid) + nsuid;
1180 }
1181 }
1182
1183 // no answer found
1184 return -1;
1185}
1186
1187/*
1188 * for is_privileged_over,
1189 * specify whether we require the calling uid to be root in his
1190 * namespace
1191 */
1192#define NS_ROOT_REQD true
1193#define NS_ROOT_OPT false
1194
1195#define PROCLEN 100
1196
1197static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1198{
1199 char fpath[PROCLEN];
1200 int ret;
1201 bool answer = false;
1202 uid_t nsuid;
1203
1204 if (victim == -1 || uid == -1)
1205 return false;
1206
1207 /*
1208 * If the request is one not requiring root in the namespace,
1209 * then having the same uid suffices. (i.e. uid 1000 has write
1210 * access to files owned by uid 1000
1211 */
1212 if (!req_ns_root && uid == victim)
1213 return true;
1214
1215 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1216 if (ret < 0 || ret >= PROCLEN)
1217 return false;
1218 FILE *f = fopen(fpath, "r");
1219 if (!f)
1220 return false;
1221
1222 /* if caller's not root in his namespace, reject */
1223 nsuid = convert_id_to_ns(f, uid);
1224 if (nsuid)
1225 goto out;
1226
1227 /*
1228 * If victim is not mapped into caller's ns, reject.
1229 * XXX I'm not sure this check is needed given that fuse
1230 * will be sending requests where the vfs has converted
1231 */
1232 nsuid = convert_id_to_ns(f, victim);
1233 if (nsuid == -1)
1234 goto out;
1235
1236 answer = true;
1237
1238out:
1239 fclose(f);
1240 return answer;
1241}
1242
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the open(2) access mode in @req_mode.
 *
 * Only (req_mode & O_ACCMODE) is looked at.  Callers shift owner or group
 * bits down into the "other" position before calling.  Returns false for an
 * access mode that is none of O_RDONLY, O_WRONLY, O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t accmode = req_mode & O_ACCMODE;
	mode_t needed;

	if (accmode == O_RDONLY)
		needed = S_IROTH;
	else if (accmode == O_WRONLY)
		needed = S_IWOTH;
	else if (accmode == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1262
1263
1264/*
1265 * taskcg is a/b/c
1266 * querycg is /a/b/c/d/e
1267 * we return 'd'
1268 */
1269static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1270{
1271 char *start, *end;
1272
1273 if (strlen(taskcg) <= strlen(querycg)) {
1274 fprintf(stderr, "%s: I was fed bad input\n", __func__);
1275 return NULL;
1276 }
1277
06081b29 1278 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
237e200e
SH
1279 start = strdup(taskcg + 1);
1280 else
1281 start = strdup(taskcg + strlen(querycg) + 1);
1282 if (!start)
1283 return NULL;
1284 end = strchr(start, '/');
1285 if (end)
1286 *end = '\0';
1287 return start;
1288}
1289
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1296
1297static char *get_pid_cgroup(pid_t pid, const char *contrl)
1298{
5dd3e6fd 1299 int cfd;
237e200e
SH
1300 char fnam[PROCLEN];
1301 FILE *f;
1302 char *answer = NULL;
1303 char *line = NULL;
1304 size_t len = 0;
1305 int ret;
5dd3e6fd 1306 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1307 if (!h)
1308 return NULL;
1309
1310 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1311 if (ret < 0 || ret >= PROCLEN)
1312 return NULL;
1313 if (!(f = fopen(fnam, "r")))
1314 return NULL;
1315
1316 while (getline(&line, &len, f) != -1) {
1317 char *c1, *c2;
1318 if (!line[0])
1319 continue;
1320 c1 = strchr(line, ':');
1321 if (!c1)
1322 goto out;
1323 c1++;
1324 c2 = strchr(c1, ':');
1325 if (!c2)
1326 goto out;
1327 *c2 = '\0';
1328 if (strcmp(c1, h) != 0)
1329 continue;
1330 c2++;
1331 stripnewline(c2);
1332 do {
1333 answer = strdup(c2);
1334 } while (!answer);
1335 break;
1336 }
1337
1338out:
1339 fclose(f);
1340 free(line);
1341 return answer;
1342}
1343
1344/*
1345 * check whether a fuse context may access a cgroup dir or file
1346 *
1347 * If file is not null, it is a cgroup file to check under cg.
1348 * If file is null, then we are checking perms on cg itself.
1349 *
1350 * For files we can check the mode of the list_keys result.
1351 * For cgroups, we must make assumptions based on the files under the
1352 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1353 * yet.
1354 */
1355static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1356{
1357 struct cgfs_files *k = NULL;
1358 bool ret = false;
1359
1360 k = cgfs_get_key(contrl, cg, file);
1361 if (!k)
1362 return false;
1363
1364 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1365 if (perms_include(k->mode >> 6, mode)) {
1366 ret = true;
1367 goto out;
1368 }
1369 }
1370 if (fc->gid == k->gid) {
1371 if (perms_include(k->mode >> 3, mode)) {
1372 ret = true;
1373 goto out;
1374 }
1375 }
1376 ret = perms_include(k->mode, mode);
1377
1378out:
1379 free_key(k);
1380 return ret;
1381}
1382
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg, in place.
 * If the suffix is the entire path, "/" is left behind instead of an empty
 * string.  Paths without that suffix are not modified.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* Keep the leading '/' when nothing precedes the suffix. */
	tail[tail == cg ? 1 : 0] = '\0';
}
1400
1401/*
1402 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1403 * If pid is in /a, he may act on /a/b, but not on /b.
1404 * if the answer is false and nextcg is not NULL, then *nextcg will point
1405 * to a string containing the next cgroup directory under cg, which must be
1406 * freed by the caller.
1407 */
1408static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1409{
1410 bool answer = false;
1411 char *c2 = get_pid_cgroup(pid, contrl);
1412 char *linecmp;
1413
1414 if (!c2)
1415 return false;
1416 prune_init_slice(c2);
1417
1418 /*
12c31268
CB
1419 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1420 * they pass in a cgroup without leading '/'
1421 *
1422 * The original line here was:
1423 * linecmp = *cg == '/' ? c2 : c2+1;
1424 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1425 * Serge, do you know?
237e200e 1426 */
12c31268
CB
1427 if (*cg == '/' || !strncmp(cg, "./", 2))
1428 linecmp = c2;
1429 else
1430 linecmp = c2 + 1;
237e200e
SH
1431 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1432 if (nextcg) {
1433 *nextcg = get_next_cgroup_dir(linecmp, cg);
1434 }
1435 goto out;
1436 }
1437 answer = true;
1438
1439out:
1440 free(c2);
1441 return answer;
1442}
1443
1444/*
1445 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1446 */
1447static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1448{
1449 bool answer = false;
1450 char *c2, *task_cg;
1451 size_t target_len, task_len;
1452
f7bff426 1453 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
237e200e
SH
1454 return true;
1455
1456 c2 = get_pid_cgroup(pid, contrl);
1457 if (!c2)
1458 return false;
1459 prune_init_slice(c2);
1460
1461 task_cg = c2 + 1;
1462 target_len = strlen(cg);
1463 task_len = strlen(task_cg);
1464 if (task_len == 0) {
1465 /* Task is in the root cg, it can see everything. This case is
1466 * not handled by the strmcps below, since they test for the
1467 * last /, but that is the first / that we've chopped off
1468 * above.
1469 */
1470 answer = true;
1471 goto out;
1472 }
1473 if (strcmp(cg, task_cg) == 0) {
1474 answer = true;
1475 goto out;
1476 }
1477 if (target_len < task_len) {
1478 /* looking up a parent dir */
1479 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1480 answer = true;
1481 goto out;
1482 }
1483 if (target_len > task_len) {
1484 /* looking up a child dir */
1485 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1486 answer = true;
1487 goto out;
1488 }
1489
1490out:
1491 free(c2);
1492 return answer;
1493}
1494
1495/*
1496 * given /cgroup/freezer/a/b, return "freezer".
1497 * the returned char* should NOT be freed.
1498 */
1499static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1500{
1501 const char *p1;
1502 char *contr, *slash;
1503
1504 if (strlen(path) < 9)
1505 return NULL;
1506 if (*(path+7) != '/')
1507 return NULL;
1508 p1 = path+8;
1509 contr = strdupa(p1);
1510 if (!contr)
1511 return NULL;
1512 slash = strstr(contr, "/");
1513 if (slash)
1514 *slash = '\0';
1515
1516 int i;
1517 for (i = 0; i < num_hierarchies; i++) {
1518 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1519 return hierarchies[i];
1520 }
1521 return NULL;
1522}
1523
1524/*
1525 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1526 * Note that the returned value may include files (keynames) etc
1527 */
1528static const char *find_cgroup_in_path(const char *path)
1529{
1530 const char *p1;
1531
1532 if (strlen(path) < 9)
1533 return NULL;
1534 p1 = strstr(path+8, "/");
1535 if (!p1)
1536 return NULL;
1537 return p1+1;
1538}
1539
1540/*
1541 * split the last path element from the path in @cg.
1542 * @dir is newly allocated and should be freed, @last not
1543*/
1544static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1545{
1546 char *p;
1547
1548 do {
1549 *dir = strdup(cg);
1550 } while (!*dir);
1551 *last = strrchr(cg, '/');
1552 if (!*last) {
1553 *last = NULL;
1554 return;
1555 }
1556 p = strrchr(*dir, '/');
1557 *p = '\0';
1558}
1559
1560/*
1561 * FUSE ops for /cgroup
1562 */
1563
1564int cg_getattr(const char *path, struct stat *sb)
1565{
1566 struct timespec now;
1567 struct fuse_context *fc = fuse_get_context();
1568 char * cgdir = NULL;
1569 char *last = NULL, *path1, *path2;
1570 struct cgfs_files *k = NULL;
1571 const char *cgroup;
1572 const char *controller = NULL;
1573 int ret = -ENOENT;
1574
1575
1576 if (!fc)
1577 return -EIO;
1578
1579 memset(sb, 0, sizeof(struct stat));
1580
1581 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1582 return -EINVAL;
1583
1584 sb->st_uid = sb->st_gid = 0;
1585 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1586 sb->st_size = 0;
1587
1588 if (strcmp(path, "/cgroup") == 0) {
1589 sb->st_mode = S_IFDIR | 00755;
1590 sb->st_nlink = 2;
1591 return 0;
1592 }
1593
1594 controller = pick_controller_from_path(fc, path);
1595 if (!controller)
1596 return -EIO;
1597 cgroup = find_cgroup_in_path(path);
1598 if (!cgroup) {
1599 /* this is just /cgroup/controller, return it as a dir */
1600 sb->st_mode = S_IFDIR | 00755;
1601 sb->st_nlink = 2;
1602 return 0;
1603 }
1604
1605 get_cgdir_and_path(cgroup, &cgdir, &last);
1606
1607 if (!last) {
1608 path1 = "/";
1609 path2 = cgdir;
1610 } else {
1611 path1 = cgdir;
1612 path2 = last;
1613 }
1614
1615 pid_t initpid = lookup_initpid_in_store(fc->pid);
1616 if (initpid <= 0)
1617 initpid = fc->pid;
1618 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1619 * Then check that caller's cgroup is under path if last is a child
1620 * cgroup, or cgdir if last is a file */
1621
1622 if (is_child_cgroup(controller, path1, path2)) {
1623 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1624 ret = -ENOENT;
1625 goto out;
1626 }
1627 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1628 /* this is just /cgroup/controller, return it as a dir */
1629 sb->st_mode = S_IFDIR | 00555;
1630 sb->st_nlink = 2;
1631 ret = 0;
1632 goto out;
1633 }
1634 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1635 ret = -EACCES;
1636 goto out;
1637 }
1638
1639 // get uid, gid, from '/tasks' file and make up a mode
1640 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1641 sb->st_mode = S_IFDIR | 00755;
1642 k = cgfs_get_key(controller, cgroup, NULL);
1643 if (!k) {
1644 sb->st_uid = sb->st_gid = 0;
1645 } else {
1646 sb->st_uid = k->uid;
1647 sb->st_gid = k->gid;
1648 }
1649 free_key(k);
1650 sb->st_nlink = 2;
1651 ret = 0;
1652 goto out;
1653 }
1654
1655 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1656 sb->st_mode = S_IFREG | k->mode;
1657 sb->st_nlink = 1;
1658 sb->st_uid = k->uid;
1659 sb->st_gid = k->gid;
1660 sb->st_size = 0;
1661 free_key(k);
1662 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1663 ret = -ENOENT;
1664 goto out;
1665 }
1666 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1667 ret = -EACCES;
1668 goto out;
1669 }
1670
1671 ret = 0;
1672 }
1673
1674out:
1675 free(cgdir);
1676 return ret;
1677}
1678
1679int cg_opendir(const char *path, struct fuse_file_info *fi)
1680{
1681 struct fuse_context *fc = fuse_get_context();
1682 const char *cgroup;
1683 struct file_info *dir_info;
1684 char *controller = NULL;
1685
1686 if (!fc)
1687 return -EIO;
1688
1689 if (strcmp(path, "/cgroup") == 0) {
1690 cgroup = NULL;
1691 controller = NULL;
1692 } else {
1693 // return list of keys for the controller, and list of child cgroups
1694 controller = pick_controller_from_path(fc, path);
1695 if (!controller)
1696 return -EIO;
1697
1698 cgroup = find_cgroup_in_path(path);
1699 if (!cgroup) {
1700 /* this is just /cgroup/controller, return its contents */
1701 cgroup = "/";
1702 }
1703 }
1704
1705 pid_t initpid = lookup_initpid_in_store(fc->pid);
1706 if (initpid <= 0)
1707 initpid = fc->pid;
1708 if (cgroup) {
1709 if (!caller_may_see_dir(initpid, controller, cgroup))
1710 return -ENOENT;
1711 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1712 return -EACCES;
1713 }
1714
1715 /* we'll free this at cg_releasedir */
1716 dir_info = malloc(sizeof(*dir_info));
1717 if (!dir_info)
1718 return -ENOMEM;
1719 dir_info->controller = must_copy_string(controller);
1720 dir_info->cgroup = must_copy_string(cgroup);
1721 dir_info->type = LXC_TYPE_CGDIR;
1722 dir_info->buf = NULL;
1723 dir_info->file = NULL;
1724 dir_info->buflen = 0;
1725
1726 fi->fh = (unsigned long)dir_info;
1727 return 0;
1728}
1729
1730int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1731 struct fuse_file_info *fi)
1732{
1733 struct file_info *d = (struct file_info *)fi->fh;
1734 struct cgfs_files **list = NULL;
1735 int i, ret;
1736 char *nextcg = NULL;
1737 struct fuse_context *fc = fuse_get_context();
1738 char **clist = NULL;
1739
1740 if (d->type != LXC_TYPE_CGDIR) {
1741 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1742 return -EIO;
1743 }
1744 if (!d->cgroup && !d->controller) {
1745 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1746 int i;
1747
1748 for (i = 0; i < num_hierarchies; i++) {
1749 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1750 return -EIO;
1751 }
1752 }
1753 return 0;
1754 }
1755
1756 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1757 // not a valid cgroup
1758 ret = -EINVAL;
1759 goto out;
1760 }
1761
1762 pid_t initpid = lookup_initpid_in_store(fc->pid);
1763 if (initpid <= 0)
1764 initpid = fc->pid;
1765 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1766 if (nextcg) {
1767 ret = filler(buf, nextcg, NULL, 0);
1768 free(nextcg);
1769 if (ret != 0) {
1770 ret = -EIO;
1771 goto out;
1772 }
1773 }
1774 ret = 0;
1775 goto out;
1776 }
1777
1778 for (i = 0; list[i]; i++) {
1779 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1780 ret = -EIO;
1781 goto out;
1782 }
1783 }
1784
1785 // now get the list of child cgroups
1786
1787 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1788 ret = 0;
1789 goto out;
1790 }
f366da65
WB
1791 if (clist) {
1792 for (i = 0; clist[i]; i++) {
1793 if (filler(buf, clist[i], NULL, 0) != 0) {
1794 ret = -EIO;
1795 goto out;
1796 }
237e200e
SH
1797 }
1798 }
1799 ret = 0;
1800
1801out:
1802 free_keys(list);
1803 if (clist) {
1804 for (i = 0; clist[i]; i++)
1805 free(clist[i]);
1806 free(clist);
1807 }
1808 return ret;
1809}
1810
43215927 1811static void do_release_file_info(struct fuse_file_info *fi)
237e200e 1812{
43215927
SH
1813 struct file_info *f = (struct file_info *)fi->fh;
1814
237e200e
SH
1815 if (!f)
1816 return;
43215927
SH
1817
1818 fi->fh = 0;
1819
237e200e 1820 free(f->controller);
43215927 1821 f->controller = NULL;
237e200e 1822 free(f->cgroup);
43215927 1823 f->cgroup = NULL;
237e200e 1824 free(f->file);
43215927 1825 f->file = NULL;
237e200e 1826 free(f->buf);
43215927 1827 f->buf = NULL;
237e200e
SH
1828 free(f);
1829}
1830
/*
 * FUSE releasedir: drop the file_info attached to @fi by cg_opendir().
 * @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1836
1837int cg_open(const char *path, struct fuse_file_info *fi)
1838{
1839 const char *cgroup;
1840 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1841 struct cgfs_files *k = NULL;
1842 struct file_info *file_info;
1843 struct fuse_context *fc = fuse_get_context();
1844 int ret;
1845
1846 if (!fc)
1847 return -EIO;
1848
1849 controller = pick_controller_from_path(fc, path);
1850 if (!controller)
1851 return -EIO;
1852 cgroup = find_cgroup_in_path(path);
1853 if (!cgroup)
1854 return -EINVAL;
1855
1856 get_cgdir_and_path(cgroup, &cgdir, &last);
1857 if (!last) {
1858 path1 = "/";
1859 path2 = cgdir;
1860 } else {
1861 path1 = cgdir;
1862 path2 = last;
1863 }
1864
1865 k = cgfs_get_key(controller, path1, path2);
1866 if (!k) {
1867 ret = -EINVAL;
1868 goto out;
1869 }
1870 free_key(k);
1871
1872 pid_t initpid = lookup_initpid_in_store(fc->pid);
1873 if (initpid <= 0)
1874 initpid = fc->pid;
1875 if (!caller_may_see_dir(initpid, controller, path1)) {
1876 ret = -ENOENT;
1877 goto out;
1878 }
1879 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
1880 ret = -EACCES;
1881 goto out;
1882 }
1883
1884 /* we'll free this at cg_release */
1885 file_info = malloc(sizeof(*file_info));
1886 if (!file_info) {
1887 ret = -ENOMEM;
1888 goto out;
1889 }
1890 file_info->controller = must_copy_string(controller);
1891 file_info->cgroup = must_copy_string(path1);
1892 file_info->file = must_copy_string(path2);
1893 file_info->type = LXC_TYPE_CGFILE;
1894 file_info->buf = NULL;
1895 file_info->buflen = 0;
1896
1897 fi->fh = (unsigned long)file_info;
1898 ret = 0;
1899
1900out:
1901 free(cgdir);
1902 return ret;
1903}
1904
bddbb106
SH
1905int cg_access(const char *path, int mode)
1906{
1907 const char *cgroup;
1908 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1909 struct cgfs_files *k = NULL;
1910 struct fuse_context *fc = fuse_get_context();
1911 int ret;
1912
1913 if (!fc)
1914 return -EIO;
1915
1916 controller = pick_controller_from_path(fc, path);
1917 if (!controller)
1918 return -EIO;
1919 cgroup = find_cgroup_in_path(path);
575316c4
SH
1920 if (!cgroup) {
1921 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
1922 if ((mode & W_OK) == 0)
1923 return 0;
1924 return -EACCES;
575316c4 1925 }
bddbb106
SH
1926
1927 get_cgdir_and_path(cgroup, &cgdir, &last);
1928 if (!last) {
1929 path1 = "/";
1930 path2 = cgdir;
1931 } else {
1932 path1 = cgdir;
1933 path2 = last;
1934 }
1935
1936 k = cgfs_get_key(controller, path1, path2);
1937 if (!k) {
3f441bc7
SH
1938 if ((mode & W_OK) == 0)
1939 ret = 0;
1940 else
1941 ret = -EACCES;
bddbb106
SH
1942 goto out;
1943 }
1944 free_key(k);
1945
1946 pid_t initpid = lookup_initpid_in_store(fc->pid);
1947 if (initpid <= 0)
1948 initpid = fc->pid;
1949 if (!caller_may_see_dir(initpid, controller, path1)) {
1950 ret = -ENOENT;
1951 goto out;
1952 }
1953 if (!fc_may_access(fc, controller, path1, path2, mode)) {
1954 ret = -EACCES;
1955 goto out;
1956 }
1957
1958 ret = 0;
1959
1960out:
1961 free(cgdir);
1962 return ret;
1963}
1964
237e200e
SH
/*
 * FUSE release: drop the file_info attached to @fi by cg_open().
 * @path is not used.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1970
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable (or is hung up), for at most about
 * @timeout seconds measured with time().
 *
 * Returns true if an event was reported.  Returns false on timeout (errno
 * is then 0 if the deadline had already passed, or whatever epoll_wait()
 * left when it reported no events) and on any failure to read the clock or
 * to set up / wait on the epoll instance.  Interrupted waits (EINTR) are
 * restarted with the remaining time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, saved_errno;
	/* time_t, not int: an int would truncate the clock value. */
	time_t starttime, now, remaining;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	remaining = (starttime + timeout) - now;
	if (remaining < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	/* If the clock stepped backwards, never wait longer than asked. */
	if (remaining > timeout)
		remaining = timeout;

	/* +1 ms so that a zero remainder still polls once. */
	ret = epoll_wait(epfd, &ev, 1, (int)(1000 * remaining + 1));
	if (ret < 0 && errno == EINTR)
		goto again;
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2018
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about
 * 2 seconds for data to arrive.  Returns the result of recv(), or -1 if
 * nothing became readable in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool readable = wait_for_sock(sockfd, 2);

	return readable ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
2025
2026static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2027{
2028 struct msghdr msg = { 0 };
2029 struct iovec iov;
2030 struct cmsghdr *cmsg;
2031 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2032 char buf[1];
2033 buf[0] = 'p';
2034
2035 if (pingfirst) {
2036 if (msgrecv(sock, buf, 1) != 1) {
2037 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
2038 __func__);
2039 return SEND_CREDS_FAIL;
2040 }
2041 }
2042
2043 msg.msg_control = cmsgbuf;
2044 msg.msg_controllen = sizeof(cmsgbuf);
2045
2046 cmsg = CMSG_FIRSTHDR(&msg);
2047 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2048 cmsg->cmsg_level = SOL_SOCKET;
2049 cmsg->cmsg_type = SCM_CREDENTIALS;
2050 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2051
2052 msg.msg_name = NULL;
2053 msg.msg_namelen = 0;
2054
2055 buf[0] = v;
2056 iov.iov_base = buf;
2057 iov.iov_len = sizeof(buf);
2058 msg.msg_iov = &iov;
2059 msg.msg_iovlen = 1;
2060
2061 if (sendmsg(sock, &msg, 0) < 0) {
2062 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
2063 strerror(errno));
2064 if (errno == 3)
2065 return SEND_CREDS_NOTSK;
2066 return SEND_CREDS_FAIL;
2067 }
2068
2069 return SEND_CREDS_OK;
2070}
2071
/*
 * Receive one credential message from @sock.
 *
 * Enables SO_PASSCRED on the socket, writes a one-byte '1' to the peer,
 * then waits about 2 seconds for a message.  On return *v holds the
 * received payload byte and, if the message carried a well-formed
 * SCM_CREDENTIALS control message, *cred holds those credentials.
 * Otherwise the fields of *cred stay at -1.
 *
 * Returns true if a message was received (even one without credentials),
 * false on setsockopt/write/recvmsg failure or timeout.  *v is '1' and the
 * *cred fields are -1 on failure.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char payload[1];
	struct iovec iov;
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int enable = 1;
	int nread;

	/* Defaults reported when nothing usable arrives. */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* Tell the peer we are ready; senders using pingfirst wait for this. */
	payload[0] = '1';
	if (write(sock, payload, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = payload;
	iov.iov_len = sizeof(payload);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	nread = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (nread < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = payload[0];
	return true;
}
2131
35174b0f
FG
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* pipe on which the clone child acknowledges that it has started */
	int *cpipe;
	/* socket passed on to the wrapped function */
	int sock;
	/* pid passed on to the wrapped function */
	pid_t tpid;
	/* function run in the child: pid_from_ns or pid_to_ns */
	int (*wrapped) (int, pid_t);
};
2138
2139/*
2140 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2141 * with clone(). This simply writes '1' as ACK back to the parent
2142 * before calling the actual wrapped function.
2143 */
2144static int pid_ns_clone_wrapper(void *arg) {
2145 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2146 char b = '1';
2147
2148 close(args->cpipe[0]);
2149 if (write(args->cpipe[1], &b, sizeof(char)) < 0) {
2150 fprintf(stderr, "%s (child): error on write: %s\n",
2151 __func__, strerror(errno));
2152 }
2153 close(args->cpipe[1]);
2154 return args->wrapped(args->sock, args->tpid);
2155}
237e200e
SH
2156
2157/*
2158 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2159 * int value back over the socket. This shifts the pid from the
2160 * sender's pidns into tpid's pidns.
2161 */
35174b0f 2162static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2163{
2164 char v = '0';
2165 struct ucred cred;
2166
2167 while (recv_creds(sock, &cred, &v)) {
2168 if (v == '1')
35174b0f 2169 return 0;
237e200e 2170 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2171 return 1;
237e200e 2172 }
35174b0f 2173 return 0;
237e200e
SH
2174}
2175
35174b0f 2176
237e200e
SH
2177/*
2178 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2179 * in your old pidns. Only children which you clone will be in the target
2180 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2181 * actually convert pids.
2182 *
2183 * Note: glibc's fork() does not respect pidns, which can lead to failed
2184 * assertions inside glibc (and thus failed forks) if the child's pid in
2185 * the pidns and the parent pid outside are identical. Using clone prevents
2186 * this issue.
237e200e
SH
2187 */
2188static void pid_to_ns_wrapper(int sock, pid_t tpid)
2189{
2190 int newnsfd = -1, ret, cpipe[2];
2191 char fnam[100];
2192 pid_t cpid;
2193 char v;
2194
2195 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2196 if (ret < 0 || ret >= sizeof(fnam))
2197 _exit(1);
2198 newnsfd = open(fnam, O_RDONLY);
2199 if (newnsfd < 0)
2200 _exit(1);
2201 if (setns(newnsfd, 0) < 0)
2202 _exit(1);
2203 close(newnsfd);
2204
2205 if (pipe(cpipe) < 0)
2206 _exit(1);
2207
35174b0f
FG
2208 struct pid_ns_clone_args args = {
2209 .cpipe = cpipe,
2210 .sock = sock,
2211 .tpid = tpid,
2212 .wrapped = &pid_to_ns
2213 };
2214 size_t stack_size = sysconf(_SC_PAGESIZE);
2215 void *stack = alloca(stack_size);
2216
2217 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2218 if (cpid < 0)
2219 _exit(1);
2220
237e200e
SH
2221 // give the child 1 second to be done forking and
2222 // write its ack
2223 if (!wait_for_sock(cpipe[0], 1))
2224 _exit(1);
2225 ret = read(cpipe[0], &v, 1);
2226 if (ret != sizeof(char) || v != '1')
2227 _exit(1);
2228
2229 if (!wait_for_pid(cpid))
2230 _exit(1);
2231 _exit(0);
2232}
2233
2234/*
2235 * To read cgroup files with a particular pid, we will setns into the child
2236 * pidns, open a pipe, fork a child - which will be the first to really be in
2237 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2238 */
2239bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2240{
2241 int sock[2] = {-1, -1};
2242 char *tmpdata = NULL;
2243 int ret;
2244 pid_t qpid, cpid = -1;
2245 bool answer = false;
2246 char v = '0';
2247 struct ucred cred;
2248 size_t sz = 0, asz = 0;
2249
2250 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2251 return false;
2252
2253 /*
2254 * Now we read the pids from returned data one by one, pass
2255 * them into a child in the target namespace, read back the
2256 * translated pids, and put them into our to-return data
2257 */
2258
2259 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2260 perror("socketpair");
2261 free(tmpdata);
2262 return false;
2263 }
2264
2265 cpid = fork();
2266 if (cpid == -1)
2267 goto out;
2268
2269 if (!cpid) // child - exits when done
2270 pid_to_ns_wrapper(sock[1], tpid);
2271
2272 char *ptr = tmpdata;
2273 cred.uid = 0;
2274 cred.gid = 0;
2275 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2276 cred.pid = qpid;
2277 ret = send_creds(sock[0], &cred, v, true);
2278
2279 if (ret == SEND_CREDS_NOTSK)
2280 goto next;
2281 if (ret == SEND_CREDS_FAIL)
2282 goto out;
2283
2284 // read converted results
2285 if (!wait_for_sock(sock[0], 2)) {
2286 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2287 __func__, strerror(errno));
2288 goto out;
2289 }
2290 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2291 fprintf(stderr, "%s: error reading pid from child: %s\n",
2292 __func__, strerror(errno));
2293 goto out;
2294 }
2295 must_strcat_pid(d, &sz, &asz, qpid);
2296next:
2297 ptr = strchr(ptr, '\n');
2298 if (!ptr)
2299 break;
2300 ptr++;
2301 }
2302
2303 cred.pid = getpid();
2304 v = '1';
2305 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2306 // failed to ask child to exit
2307 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2308 __func__, strerror(errno));
2309 goto out;
2310 }
2311
2312 answer = true;
2313
2314out:
2315 free(tmpdata);
2316 if (cpid != -1)
2317 wait_for_pid(cpid);
2318 if (sock[0] != -1) {
2319 close(sock[0]);
2320 close(sock[1]);
2321 }
2322 return answer;
2323}
2324
2325int cg_read(const char *path, char *buf, size_t size, off_t offset,
2326 struct fuse_file_info *fi)
2327{
2328 struct fuse_context *fc = fuse_get_context();
2329 struct file_info *f = (struct file_info *)fi->fh;
2330 struct cgfs_files *k = NULL;
2331 char *data = NULL;
2332 int ret, s;
2333 bool r;
2334
2335 if (f->type != LXC_TYPE_CGFILE) {
2336 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2337 return -EIO;
2338 }
2339
2340 if (offset)
2341 return 0;
2342
2343 if (!fc)
2344 return -EIO;
2345
2346 if (!f->controller)
2347 return -EINVAL;
2348
2349 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2350 return -EINVAL;
2351 }
2352 free_key(k);
2353
2354
888f8f3c 2355 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2356 ret = -EACCES;
2357 goto out;
2358 }
2359
2360 if (strcmp(f->file, "tasks") == 0 ||
2361 strcmp(f->file, "/tasks") == 0 ||
2362 strcmp(f->file, "/cgroup.procs") == 0 ||
2363 strcmp(f->file, "cgroup.procs") == 0)
2364 // special case - we have to translate the pids
2365 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2366 else
2367 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2368
2369 if (!r) {
2370 ret = -EINVAL;
2371 goto out;
2372 }
2373
2374 if (!data) {
2375 ret = 0;
2376 goto out;
2377 }
2378 s = strlen(data);
2379 if (s > size)
2380 s = size;
2381 memcpy(buf, data, s);
2382 if (s > 0 && s < size && data[s-1] != '\n')
2383 buf[s++] = '\n';
2384
2385 ret = s;
2386
2387out:
2388 free(data);
2389 return ret;
2390}
2391
35174b0f 2392static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2393{
2394 pid_t vpid;
2395 struct ucred cred;
2396 char v;
2397 int ret;
2398
2399 cred.uid = 0;
2400 cred.gid = 0;
2401 while (1) {
2402 if (!wait_for_sock(sock, 2)) {
2403 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
35174b0f 2404 return 1;
237e200e
SH
2405 }
2406 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2407 fprintf(stderr, "%s: bad read from parent: %s\n",
2408 __func__, strerror(errno));
35174b0f 2409 return 1;
237e200e
SH
2410 }
2411 if (vpid == -1) // done
2412 break;
2413 v = '0';
2414 cred.pid = vpid;
2415 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2416 v = '1';
2417 cred.pid = getpid();
2418 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2419 return 1;
237e200e
SH
2420 }
2421 }
35174b0f 2422 return 0;
237e200e
SH
2423}
2424
2425static void pid_from_ns_wrapper(int sock, pid_t tpid)
2426{
2427 int newnsfd = -1, ret, cpipe[2];
2428 char fnam[100];
2429 pid_t cpid;
2430 char v;
2431
2432 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2433 if (ret < 0 || ret >= sizeof(fnam))
2434 _exit(1);
2435 newnsfd = open(fnam, O_RDONLY);
2436 if (newnsfd < 0)
2437 _exit(1);
2438 if (setns(newnsfd, 0) < 0)
2439 _exit(1);
2440 close(newnsfd);
2441
2442 if (pipe(cpipe) < 0)
2443 _exit(1);
2444
35174b0f
FG
2445 struct pid_ns_clone_args args = {
2446 .cpipe = cpipe,
2447 .sock = sock,
2448 .tpid = tpid,
2449 .wrapped = &pid_from_ns
2450 };
f0f8b851
SH
2451 size_t stack_size = sysconf(_SC_PAGESIZE);
2452 void *stack = alloca(stack_size);
35174b0f
FG
2453
2454 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2455 if (cpid < 0)
2456 _exit(1);
2457
237e200e
SH
2458 // give the child 1 second to be done forking and
2459 // write its ack
2460 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2461 _exit(1);
237e200e 2462 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2463 if (ret != sizeof(char) || v != '1')
2464 _exit(1);
237e200e
SH
2465
2466 if (!wait_for_pid(cpid))
2467 _exit(1);
2468 _exit(0);
237e200e
SH
2469}
2470
/*
 * hostuid_to_ns: translate host @uid through /proc/@pid/uid_map.
 *
 * The translated id is stored in @answer. Returns false when the map file
 * cannot be opened or when convert_id_to_ns() yields -1, true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *map;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	map = fopen(mapfile, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2492
/*
 * get_pid_creds: read the first id on the "Uid:" and "Gid:" lines of
 * /proc/@pid/status into @uid and @gid.
 * (XXX should we use euid here?)
 *
 * Both outputs are preset to -1, so a value that could not be read stays
 * -1. Parsing stops at the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (!strncmp(line, "Uid:", 4)) {
			if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
		} else if (!strncmp(line, "Gid:", 4)) {
			if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2531
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r's uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* The requestor has to be root inside its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped into that namespace. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2557
2558static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2559 const char *file, const char *buf)
2560{
2561 int sock[2] = {-1, -1};
2562 pid_t qpid, cpid = -1;
2563 FILE *pids_file = NULL;
2564 bool answer = false, fail = false;
2565
2566 pids_file = open_pids_file(contrl, cg);
2567 if (!pids_file)
2568 return false;
2569
2570 /*
2571 * write the pids to a socket, have helper in writer's pidns
2572 * call movepid for us
2573 */
2574 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2575 perror("socketpair");
2576 goto out;
2577 }
2578
2579 cpid = fork();
2580 if (cpid == -1)
2581 goto out;
2582
2583 if (!cpid) { // child
2584 fclose(pids_file);
2585 pid_from_ns_wrapper(sock[1], tpid);
2586 }
2587
2588 const char *ptr = buf;
2589 while (sscanf(ptr, "%d", &qpid) == 1) {
2590 struct ucred cred;
2591 char v;
2592
2593 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2594 fprintf(stderr, "%s: error writing pid to child: %s\n",
2595 __func__, strerror(errno));
2596 goto out;
2597 }
2598
2599 if (recv_creds(sock[0], &cred, &v)) {
2600 if (v == '0') {
2601 if (!may_move_pid(tpid, tuid, cred.pid)) {
2602 fail = true;
2603 break;
2604 }
2605 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2606 fail = true;
2607 }
2608 }
2609
2610 ptr = strchr(ptr, '\n');
2611 if (!ptr)
2612 break;
2613 ptr++;
2614 }
2615
2616 /* All good, write the value */
2617 qpid = -1;
2618 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2619 fprintf(stderr, "Warning: failed to ask child to exit\n");
2620
2621 if (!fail)
2622 answer = true;
2623
2624out:
2625 if (cpid != -1)
2626 wait_for_pid(cpid);
2627 if (sock[0] != -1) {
2628 close(sock[0]);
2629 close(sock[1]);
2630 }
2631 if (pids_file) {
2632 if (fclose(pids_file) != 0)
2633 answer = false;
2634 }
2635 return answer;
2636}
2637
2638int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2639 struct fuse_file_info *fi)
2640{
2641 struct fuse_context *fc = fuse_get_context();
2642 char *localbuf = NULL;
2643 struct cgfs_files *k = NULL;
2644 struct file_info *f = (struct file_info *)fi->fh;
2645 bool r;
2646
2647 if (f->type != LXC_TYPE_CGFILE) {
2648 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2649 return -EIO;
2650 }
2651
2652 if (offset)
2653 return 0;
2654
2655 if (!fc)
2656 return -EIO;
2657
2658 localbuf = alloca(size+1);
2659 localbuf[size] = '\0';
2660 memcpy(localbuf, buf, size);
2661
2662 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2663 size = -EINVAL;
2664 goto out;
2665 }
2666
2667 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2668 size = -EACCES;
2669 goto out;
2670 }
2671
2672 if (strcmp(f->file, "tasks") == 0 ||
2673 strcmp(f->file, "/tasks") == 0 ||
2674 strcmp(f->file, "/cgroup.procs") == 0 ||
2675 strcmp(f->file, "cgroup.procs") == 0)
2676 // special case - we have to translate the pids
2677 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2678 else
2679 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2680
2681 if (!r)
2682 size = -EINVAL;
2683
2684out:
2685 free_key(k);
2686 return size;
2687}
2688
2689int cg_chown(const char *path, uid_t uid, gid_t gid)
2690{
2691 struct fuse_context *fc = fuse_get_context();
2692 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2693 struct cgfs_files *k = NULL;
2694 const char *cgroup;
2695 int ret;
2696
2697 if (!fc)
2698 return -EIO;
2699
2700 if (strcmp(path, "/cgroup") == 0)
2701 return -EINVAL;
2702
2703 controller = pick_controller_from_path(fc, path);
2704 if (!controller)
2705 return -EINVAL;
2706 cgroup = find_cgroup_in_path(path);
2707 if (!cgroup)
2708 /* this is just /cgroup/controller */
2709 return -EINVAL;
2710
2711 get_cgdir_and_path(cgroup, &cgdir, &last);
2712
2713 if (!last) {
2714 path1 = "/";
2715 path2 = cgdir;
2716 } else {
2717 path1 = cgdir;
2718 path2 = last;
2719 }
2720
2721 if (is_child_cgroup(controller, path1, path2)) {
2722 // get uid, gid, from '/tasks' file and make up a mode
2723 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2724 k = cgfs_get_key(controller, cgroup, "tasks");
2725
2726 } else
2727 k = cgfs_get_key(controller, path1, path2);
2728
2729 if (!k) {
2730 ret = -EINVAL;
2731 goto out;
2732 }
2733
2734 /*
2735 * This being a fuse request, the uid and gid must be valid
2736 * in the caller's namespace. So we can just check to make
2737 * sure that the caller is root in his uid, and privileged
2738 * over the file's current owner.
2739 */
2740 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2741 ret = -EACCES;
2742 goto out;
2743 }
2744
2745 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2746
2747out:
2748 free_key(k);
2749 free(cgdir);
2750
2751 return ret;
2752}
2753
2754int cg_chmod(const char *path, mode_t mode)
2755{
2756 struct fuse_context *fc = fuse_get_context();
2757 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2758 struct cgfs_files *k = NULL;
2759 const char *cgroup;
2760 int ret;
2761
2762 if (!fc)
2763 return -EIO;
2764
2765 if (strcmp(path, "/cgroup") == 0)
2766 return -EINVAL;
2767
2768 controller = pick_controller_from_path(fc, path);
2769 if (!controller)
2770 return -EINVAL;
2771 cgroup = find_cgroup_in_path(path);
2772 if (!cgroup)
2773 /* this is just /cgroup/controller */
2774 return -EINVAL;
2775
2776 get_cgdir_and_path(cgroup, &cgdir, &last);
2777
2778 if (!last) {
2779 path1 = "/";
2780 path2 = cgdir;
2781 } else {
2782 path1 = cgdir;
2783 path2 = last;
2784 }
2785
2786 if (is_child_cgroup(controller, path1, path2)) {
2787 // get uid, gid, from '/tasks' file and make up a mode
2788 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2789 k = cgfs_get_key(controller, cgroup, "tasks");
2790
2791 } else
2792 k = cgfs_get_key(controller, path1, path2);
2793
2794 if (!k) {
2795 ret = -EINVAL;
2796 goto out;
2797 }
2798
2799 /*
2800 * This being a fuse request, the uid and gid must be valid
2801 * in the caller's namespace. So we can just check to make
2802 * sure that the caller is root in his uid, and privileged
2803 * over the file's current owner.
2804 */
2805 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2806 ret = -EPERM;
2807 goto out;
2808 }
2809
2810 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2811 ret = -EINVAL;
2812 goto out;
2813 }
2814
2815 ret = 0;
2816out:
2817 free_key(k);
2818 free(cgdir);
2819 return ret;
2820}
2821
2822int cg_mkdir(const char *path, mode_t mode)
2823{
2824 struct fuse_context *fc = fuse_get_context();
2825 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2826 const char *cgroup;
2827 int ret;
2828
2829 if (!fc)
2830 return -EIO;
2831
2832
2833 controller = pick_controller_from_path(fc, path);
2834 if (!controller)
2835 return -EINVAL;
2836
2837 cgroup = find_cgroup_in_path(path);
2838 if (!cgroup)
2839 return -EINVAL;
2840
2841 get_cgdir_and_path(cgroup, &cgdir, &last);
2842 if (!last)
2843 path1 = "/";
2844 else
2845 path1 = cgdir;
2846
2847 pid_t initpid = lookup_initpid_in_store(fc->pid);
2848 if (initpid <= 0)
2849 initpid = fc->pid;
2850 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2851 if (!next)
2852 ret = -EINVAL;
2853 else if (last && strcmp(next, last) == 0)
2854 ret = -EEXIST;
2855 else
2856 ret = -ENOENT;
2857 goto out;
2858 }
2859
2860 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2861 ret = -EACCES;
2862 goto out;
2863 }
2864 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2865 ret = -EACCES;
2866 goto out;
2867 }
2868
2869 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2870
2871out:
2872 free(cgdir);
2873 free(next);
2874 return ret;
2875}
2876
2877int cg_rmdir(const char *path)
2878{
2879 struct fuse_context *fc = fuse_get_context();
2880 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2881 const char *cgroup;
2882 int ret;
2883
2884 if (!fc)
2885 return -EIO;
2886
2887 controller = pick_controller_from_path(fc, path);
2888 if (!controller)
2889 return -EINVAL;
2890
2891 cgroup = find_cgroup_in_path(path);
2892 if (!cgroup)
2893 return -EINVAL;
2894
2895 get_cgdir_and_path(cgroup, &cgdir, &last);
2896 if (!last) {
2897 ret = -EINVAL;
2898 goto out;
2899 }
2900
2901 pid_t initpid = lookup_initpid_in_store(fc->pid);
2902 if (initpid <= 0)
2903 initpid = fc->pid;
2904 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2905 if (!last || strcmp(next, last) == 0)
2906 ret = -EBUSY;
2907 else
2908 ret = -ENOENT;
2909 goto out;
2910 }
2911
2912 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2913 ret = -EACCES;
2914 goto out;
2915 }
2916 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2917 ret = -EACCES;
2918 goto out;
2919 }
2920
2921 if (!cgfs_remove(controller, cgroup)) {
2922 ret = -EINVAL;
2923 goto out;
2924 }
2925
2926 ret = 0;
2927
2928out:
2929 free(cgdir);
2930 free(next);
2931 return ret;
2932}
2933
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
2940
/*
 * If @line starts with "@key " parse the number following the key as a byte
 * count and store it, converted to kB, in @kb. Returns true when the key
 * matched, whether or not a number could be parsed.
 */
static bool memstat_read_kb(const char *line, const char *key, unsigned long *kb)
{
	size_t keylen = strlen(key);
	unsigned long bytes;

	if (strncmp(line, key, keylen) != 0 || line[keylen] != ' ')
		return false;

	if (sscanf(line + keylen, "%lu", &bytes) == 1)
		*kb = bytes / 1024;
	return true;
}

/*
 * parse_memstat: pick the cache, active/inactive anon/file and unevictable
 * counters out of @memstat, a buffer of "key value" lines, and store each of
 * them divided by 1024 in the matching output argument. Outputs whose key is
 * not present (or whose value does not parse) are left untouched.
 *
 * The value is located from the length of each key instead of a fixed
 * offset, so keys of any length are parsed correctly and a short line can
 * never make the parser read beyond its own text.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "cache",         cached },
		{ "active_anon",   active_anon },
		{ "inactive_anon", inactive_anon },
		{ "active_file",   active_file },
		{ "inactive_file", inactive_file },
		{ "unevictable",   unevictable },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (memstat_read_kb(memstat, fields[i].key, fields[i].dest))
				break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
2974
/*
 * get_blkio_io_value: search @str, a buffer of lines of the form
 * "<major>:<minor> <iotype> <value>", for the first line whose prefix is
 * "@major:@minor @iotype" and store the number following it in @v.
 * @v is set to 0 first and stays 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *cur, *eol;

	memset(key, 0, sizeof(key));
	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (cur = str; *cur; cur = eol + 1) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		eol = strchr(cur, '\n');
		if (!eol)
			return;
	}
}
2997
2998static int read_file(const char *path, char *buf, size_t size,
2999 struct file_info *d)
3000{
3001 size_t linelen = 0, total_len = 0, rv = 0;
3002 char *line = NULL;
3003 char *cache = d->buf;
3004 size_t cache_size = d->buflen;
3005 FILE *f = fopen(path, "r");
3006 if (!f)
3007 return 0;
3008
3009 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3010 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3011 if (l < 0) {
3012 perror("Error writing to cache");
3013 rv = 0;
3014 goto err;
3015 }
3016 if (l >= cache_size) {
3017 fprintf(stderr, "Internal error: truncated write to cache\n");
3018 rv = 0;
3019 goto err;
3020 }
3021 cache += l;
3022 cache_size -= l;
3023 total_len += l;
3024 }
3025
3026 d->size = total_len;
a262ddb7
CB
3027 if (total_len > size)
3028 total_len = size;
237e200e
SH
3029
3030 /* read from off 0 */
3031 memcpy(buf, d->buf, total_len);
3032 rv = total_len;
3033 err:
3034 fclose(f);
3035 free(line);
3036 return rv;
3037}
3038
3039/*
3040 * FUSE ops for /proc
3041 */
3042
/*
 * get_memlimit: return memory.limit_in_bytes of @cgroup in the memory
 * controller, or (unsigned long)-1 if the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &value))
		limit = strtoul(value, NULL, 10);
	free(value);

	return limit;
}
3055
/*
 * get_min_memlimit: return the smallest memory limit found on @cgroup and
 * all of its ancestors, walking up with dirname() until the root is reached.
 * Ancestors whose limit cannot be read (get_memlimit() returns -1) are
 * ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy);

	/*
	 * dirname() converges to "/" for absolute paths but to "." for
	 * relative ones; stop on either so that a relative cgroup path
	 * cannot make this loop spin forever.
	 */
	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy);
		if (memlimit != -1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3072
3073static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3074 struct fuse_file_info *fi)
3075{
3076 struct fuse_context *fc = fuse_get_context();
3077 struct file_info *d = (struct file_info *)fi->fh;
3078 char *cg;
3079 char *memusage_str = NULL, *memstat_str = NULL,
3080 *memswlimit_str = NULL, *memswusage_str = NULL,
3081 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3082 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08
SH
3083 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3084 active_file = 0, inactive_file = 0, unevictable = 0;
237e200e
SH
3085 char *line = NULL;
3086 size_t linelen = 0, total_len = 0, rv = 0;
3087 char *cache = d->buf;
3088 size_t cache_size = d->buflen;
3089 FILE *f = NULL;
3090
3091 if (offset){
3092 if (offset > d->size)
3093 return -EINVAL;
3094 if (!d->cached)
3095 return 0;
3096 int left = d->size - offset;
3097 total_len = left > size ? size: left;
3098 memcpy(buf, cache + offset, total_len);
3099 return total_len;
3100 }
3101
3102 pid_t initpid = lookup_initpid_in_store(fc->pid);
3103 if (initpid <= 0)
3104 initpid = fc->pid;
3105 cg = get_pid_cgroup(initpid, "memory");
3106 if (!cg)
3107 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3108 prune_init_slice(cg);
237e200e
SH
3109
3110 memlimit = get_min_memlimit(cg);
3111 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3112 goto err;
3113 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3114 goto err;
3115
3116 // Following values are allowed to fail, because swapaccount might be turned
3117 // off for current kernel
3118 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3119 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3120 {
3121 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3122 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3123 goto err;
3124 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3125 goto err;
3126
3127 memswlimit = strtoul(memswlimit_str, NULL, 10);
3128 memswusage = strtoul(memswusage_str, NULL, 10);
3129
3130 if (!strcmp(memswlimit_str, memswlimit_default_str))
3131 memswlimit = 0;
3132 if (!strcmp(memswusage_str, memswusage_default_str))
3133 memswusage = 0;
3134
3135 memswlimit = memswlimit / 1024;
3136 memswusage = memswusage / 1024;
3137 }
3138
3139 memusage = strtoul(memusage_str, NULL, 10);
3140 memlimit /= 1024;
3141 memusage /= 1024;
3142
c6095b08
SH
3143 parse_memstat(memstat_str, &cached, &active_anon,
3144 &inactive_anon, &active_file, &inactive_file,
3145 &unevictable);
237e200e
SH
3146
3147 f = fopen("/proc/meminfo", "r");
3148 if (!f)
3149 goto err;
3150
3151 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3152 ssize_t l;
237e200e
SH
3153 char *printme, lbuf[100];
3154
3155 memset(lbuf, 0, 100);
3156 if (startswith(line, "MemTotal:")) {
3157 sscanf(line+14, "%lu", &hosttotal);
3158 if (hosttotal < memlimit)
3159 memlimit = hosttotal;
3160 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3161 printme = lbuf;
3162 } else if (startswith(line, "MemFree:")) {
3163 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3164 printme = lbuf;
3165 } else if (startswith(line, "MemAvailable:")) {
3166 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
3167 printme = lbuf;
3168 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3169 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
3170 printme = lbuf;
3171 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
b4665ce0
SH
3172 unsigned long swaptotal = memswlimit - memlimit,
3173 swapusage = memswusage - memusage,
3174 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3175 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3176 printme = lbuf;
da35d72a
SH
3177 } else if (startswith(line, "Slab:")) {
3178 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3179 printme = lbuf;
237e200e
SH
3180 } else if (startswith(line, "Buffers:")) {
3181 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3182 printme = lbuf;
3183 } else if (startswith(line, "Cached:")) {
3184 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3185 printme = lbuf;
3186 } else if (startswith(line, "SwapCached:")) {
3187 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3188 printme = lbuf;
c6095b08
SH
3189 } else if (startswith(line, "Active")) {
3190 snprintf(lbuf, 100, "Active: %8lu kB\n",
3191 active_anon + active_file);
3192 printme = lbuf;
3193 } else if (startswith(line, "Inactive")) {
3194 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3195 inactive_anon + inactive_file);
3196 printme = lbuf;
3197 } else if (startswith(line, "Active(anon)")) {
3198 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3199 printme = lbuf;
3200 } else if (startswith(line, "Inactive(anon)")) {
3201 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3202 printme = lbuf;
3203 } else if (startswith(line, "Active(file)")) {
3204 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3205 printme = lbuf;
3206 } else if (startswith(line, "Inactive(file)")) {
3207 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3208 printme = lbuf;
3209 } else if (startswith(line, "Unevictable")) {
3210 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3211 printme = lbuf;
3212 } else if (startswith(line, "SReclaimable")) {
3213 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3214 printme = lbuf;
3215 } else if (startswith(line, "SUnreclaim")) {
3216 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3217 printme = lbuf;
237e200e
SH
3218 } else
3219 printme = line;
3220
3221 l = snprintf(cache, cache_size, "%s", printme);
3222 if (l < 0) {
3223 perror("Error writing to cache");
3224 rv = 0;
3225 goto err;
3226
3227 }
3228 if (l >= cache_size) {
3229 fprintf(stderr, "Internal error: truncated write to cache\n");
3230 rv = 0;
3231 goto err;
3232 }
3233
3234 cache += l;
3235 cache_size -= l;
3236 total_len += l;
3237 }
3238
3239 d->cached = 1;
3240 d->size = total_len;
3241 if (total_len > size ) total_len = size;
3242 memcpy(buf, d->buf, total_len);
3243
3244 rv = total_len;
3245err:
3246 if (f)
3247 fclose(f);
3248 free(line);
3249 free(cg);
3250 free(memusage_str);
3251 free(memswlimit_str);
3252 free(memswusage_str);
3253 free(memstat_str);
3254 free(memswlimit_default_str);
3255 free(memswusage_default_str);
3256 return rv;
3257}
3258
/*
 * Read cpuset.cpus of cgroup @cg in the cpuset controller.
 * Returns the value as a newly allocated string which the caller must free,
 * or NULL if it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus = NULL;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3271
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu N is part of
 * @cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpunr;

	return sscanf(line, "processor : %d", &cpunr) == 1 &&
	       cpu_in_cpuset(cpunr, cpuset);
}
3282
/*
 * Return true if @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. it starts with "processor", a colon and a cpu number.
 */
static bool is_processor_line(const char *line)
{
	int cpunr;

	return sscanf(line, "processor : %d", &cpunr) == 1;
}
3294
3295static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3296 struct fuse_file_info *fi)
3297{
3298 struct fuse_context *fc = fuse_get_context();
3299 struct file_info *d = (struct file_info *)fi->fh;
3300 char *cg;
3301 char *cpuset = NULL;
3302 char *line = NULL;
3303 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3304 bool am_printing = false, firstline = true, is_s390x = false;
3305 int curcpu = -1, cpu;
237e200e
SH
3306 char *cache = d->buf;
3307 size_t cache_size = d->buflen;
3308 FILE *f = NULL;
3309
3310 if (offset){
3311 if (offset > d->size)
3312 return -EINVAL;
3313 if (!d->cached)
3314 return 0;
3315 int left = d->size - offset;
3316 total_len = left > size ? size: left;
3317 memcpy(buf, cache + offset, total_len);
3318 return total_len;
3319 }
3320
3321 pid_t initpid = lookup_initpid_in_store(fc->pid);
3322 if (initpid <= 0)
3323 initpid = fc->pid;
3324 cg = get_pid_cgroup(initpid, "cpuset");
3325 if (!cg)
3326 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3327 prune_init_slice(cg);
237e200e
SH
3328
3329 cpuset = get_cpuset(cg);
3330 if (!cpuset)
3331 goto err;
3332
3333 f = fopen("/proc/cpuinfo", "r");
3334 if (!f)
3335 goto err;
3336
3337 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3338 ssize_t l;
f676eb79
SH
3339 if (firstline) {
3340 firstline = false;
3341 if (strstr(line, "IBM/S390") != NULL) {
3342 is_s390x = true;
3343 am_printing = true;
5ed9d4e2 3344 continue;
f676eb79
SH
3345 }
3346 }
5ed9d4e2
SH
3347 if (strncmp(line, "# processors:", 12) == 0)
3348 continue;
237e200e
SH
3349 if (is_processor_line(line)) {
3350 am_printing = cpuline_in_cpuset(line, cpuset);
3351 if (am_printing) {
3352 curcpu ++;
3353 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3354 if (l < 0) {
3355 perror("Error writing to cache");
3356 rv = 0;
3357 goto err;
3358 }
3359 if (l >= cache_size) {
3360 fprintf(stderr, "Internal error: truncated write to cache\n");
3361 rv = 0;
3362 goto err;
3363 }
3364 cache += l;
3365 cache_size -= l;
3366 total_len += l;
3367 }
3368 continue;
f676eb79
SH
3369 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3370 char *p;
3371 if (!cpu_in_cpuset(cpu, cpuset))
3372 continue;
3373 curcpu ++;
3374 p = strchr(line, ':');
3375 if (!p || !*p)
3376 goto err;
3377 p++;
5ed9d4e2 3378 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3379 if (l < 0) {
3380 perror("Error writing to cache");
3381 rv = 0;
3382 goto err;
3383 }
3384 if (l >= cache_size) {
3385 fprintf(stderr, "Internal error: truncated write to cache\n");
3386 rv = 0;
3387 goto err;
3388 }
3389 cache += l;
3390 cache_size -= l;
3391 total_len += l;
3392 continue;
3393
237e200e
SH
3394 }
3395 if (am_printing) {
3396 l = snprintf(cache, cache_size, "%s", line);
3397 if (l < 0) {
3398 perror("Error writing to cache");
3399 rv = 0;
3400 goto err;
3401 }
3402 if (l >= cache_size) {
3403 fprintf(stderr, "Internal error: truncated write to cache\n");
3404 rv = 0;
3405 goto err;
3406 }
3407 cache += l;
3408 cache_size -= l;
3409 total_len += l;
3410 }
3411 }
3412
5ed9d4e2
SH
3413 if (is_s390x) {
3414 char *origcache = d->buf;
a262ddb7 3415 ssize_t l;
5ed9d4e2
SH
3416 do {
3417 d->buf = malloc(d->buflen);
3418 } while (!d->buf);
3419 cache = d->buf;
3420 cache_size = d->buflen;
3421 total_len = 0;
3422 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3423 if (l < 0 || l >= cache_size) {
3424 free(origcache);
3425 goto err;
3426 }
3427 cache_size -= l;
3428 cache += l;
3429 total_len += l;
3430 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3431 if (l < 0 || l >= cache_size) {
3432 free(origcache);
3433 goto err;
3434 }
3435 cache_size -= l;
3436 cache += l;
3437 total_len += l;
3438 l = snprintf(cache, cache_size, "%s", origcache);
3439 free(origcache);
3440 if (l < 0 || l >= cache_size)
3441 goto err;
3442 total_len += l;
3443 }
3444
237e200e
SH
3445 d->cached = 1;
3446 d->size = total_len;
3447 if (total_len > size ) total_len = size;
3448
3449 /* read from off 0 */
3450 memcpy(buf, d->buf, total_len);
3451 rv = total_len;
3452err:
3453 if (f)
3454 fclose(f);
3455 free(line);
3456 free(cpuset);
3457 free(cg);
3458 return rv;
3459}
3460
3461static int proc_stat_read(char *buf, size_t size, off_t offset,
3462 struct fuse_file_info *fi)
3463{
3464 struct fuse_context *fc = fuse_get_context();
3465 struct file_info *d = (struct file_info *)fi->fh;
3466 char *cg;
3467 char *cpuset = NULL;
3468 char *line = NULL;
3469 size_t linelen = 0, total_len = 0, rv = 0;
3470 int curcpu = -1; /* cpu numbering starts at 0 */
3471 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3472 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3473 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3474#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3475 char cpuall[CPUALL_MAX_SIZE];
3476 /* reserve for cpu all */
3477 char *cache = d->buf + CPUALL_MAX_SIZE;
3478 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3479 FILE *f = NULL;
3480
3481 if (offset){
3482 if (offset > d->size)
3483 return -EINVAL;
3484 if (!d->cached)
3485 return 0;
3486 int left = d->size - offset;
3487 total_len = left > size ? size: left;
3488 memcpy(buf, d->buf + offset, total_len);
3489 return total_len;
3490 }
3491
3492 pid_t initpid = lookup_initpid_in_store(fc->pid);
3493 if (initpid <= 0)
3494 initpid = fc->pid;
3495 cg = get_pid_cgroup(initpid, "cpuset");
3496 if (!cg)
3497 return read_file("/proc/stat", buf, size, d);
6d2f6996 3498 prune_init_slice(cg);
237e200e
SH
3499
3500 cpuset = get_cpuset(cg);
3501 if (!cpuset)
3502 goto err;
3503
3504 f = fopen("/proc/stat", "r");
3505 if (!f)
3506 goto err;
3507
3508 //skip first line
3509 if (getline(&line, &linelen, f) < 0) {
3510 fprintf(stderr, "proc_stat_read read first line failed\n");
3511 goto err;
3512 }
3513
3514 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3515 ssize_t l;
237e200e
SH
3516 int cpu;
3517 char cpu_char[10]; /* That's a lot of cores */
3518 char *c;
3519
b4665ce0
SH
3520 if (strlen(line) == 0)
3521 continue;
237e200e
SH
3522 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3523 /* not a ^cpuN line containing a number N, just print it */
3524 l = snprintf(cache, cache_size, "%s", line);
3525 if (l < 0) {
3526 perror("Error writing to cache");
3527 rv = 0;
3528 goto err;
3529 }
3530 if (l >= cache_size) {
3531 fprintf(stderr, "Internal error: truncated write to cache\n");
3532 rv = 0;
3533 goto err;
3534 }
3535 cache += l;
3536 cache_size -= l;
3537 total_len += l;
3538 continue;
3539 }
3540
3541 if (sscanf(cpu_char, "%d", &cpu) != 1)
3542 continue;
3543 if (!cpu_in_cpuset(cpu, cpuset))
3544 continue;
3545 curcpu ++;
3546
3547 c = strchr(line, ' ');
3548 if (!c)
3549 continue;
3550 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3551 if (l < 0) {
3552 perror("Error writing to cache");
3553 rv = 0;
3554 goto err;
3555
3556 }
3557 if (l >= cache_size) {
3558 fprintf(stderr, "Internal error: truncated write to cache\n");
3559 rv = 0;
3560 goto err;
3561 }
3562
3563 cache += l;
3564 cache_size -= l;
3565 total_len += l;
3566
3567 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3568 &softirq, &steal, &guest) != 9)
3569 continue;
3570 user_sum += user;
3571 nice_sum += nice;
3572 system_sum += system;
3573 idle_sum += idle;
3574 iowait_sum += iowait;
3575 irq_sum += irq;
3576 softirq_sum += softirq;
3577 steal_sum += steal;
3578 guest_sum += guest;
3579 }
3580
3581 cache = d->buf;
3582
3583 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3584 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3585 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3586 memcpy(cache, cpuall, cpuall_len);
3587 cache += cpuall_len;
3588 } else{
3589 /* shouldn't happen */
3590 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3591 cpuall_len = 0;
3592 }
3593
3594 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3595 total_len += cpuall_len;
3596 d->cached = 1;
3597 d->size = total_len;
3598 if (total_len > size ) total_len = size;
3599
3600 memcpy(buf, d->buf, total_len);
3601 rv = total_len;
3602
3603err:
3604 if (f)
3605 fclose(f);
3606 free(line);
3607 free(cpuset);
3608 free(cg);
3609 return rv;
3610}
3611
/*
 * Return the age in seconds of the init pid that
 * lookup_initpid_in_store() reports for @pid.
 *
 * The age is the current time minus the ctime of that process's
 * /proc/<pid> entry.  0 is returned when the lookup fails, the path cannot
 * be formatted, or lstat() fails.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper = lookup_initpid_in_store(pid);
	int n;

	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3632
/*
 * Return the CPU time, in whole seconds, charged to the cpuacct cgroup of
 * the init pid that lookup_initpid_in_store() reports for @task.
 *
 * The value read from cpuacct.usage is divided by 10^9 (the kernel's
 * cgroup-v1 cpuacct documentation specifies nanoseconds).  It is parsed as
 * unsigned long long: where unsigned long is 32 bits wide, strtoul() would
 * saturate at ULONG_MAX after roughly 4.3 seconds of accumulated CPU time.
 *
 * Returns 0 when the init pid or cgroup cannot be determined or the
 * counter cannot be read.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long long usage_ns;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;

	/* Convert in 64 bits first, then narrow the (much smaller) seconds. */
	usage_ns = strtoull(usage_str, NULL, 10);
	usage = (unsigned long)(usage_ns / 1000000000ULL);

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
3656
#if RELOADTEST
/*
 * Create (or truncate) the marker file /tmp/lxcfs-iwashere.
 * Only compiled when RELOADTEST is set.
 */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd < 0)
		return;
	close(fd);
}
#endif
3667
3668/*
3669 * We read /proc/uptime and reuse its second field.
3670 * For the first field, we use the mtime for the reaper for
3671 * the calling pid as returned by getreaperage
3672 */
3673static int proc_uptime_read(char *buf, size_t size, off_t offset,
3674 struct fuse_file_info *fi)
3675{
3676 struct fuse_context *fc = fuse_get_context();
3677 struct file_info *d = (struct file_info *)fi->fh;
3678 long int reaperage = getreaperage(fc->pid);
3679 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3680 char *cache = d->buf;
a262ddb7 3681 ssize_t total_len = 0;
237e200e
SH
3682
3683#if RELOADTEST
3684 iwashere();
3685#endif
3686
3687 if (offset){
3688 if (offset > d->size)
3689 return -EINVAL;
3690 if (!d->cached)
3691 return 0;
3692 int left = d->size - offset;
3693 total_len = left > size ? size: left;
3694 memcpy(buf, cache + offset, total_len);
3695 return total_len;
3696 }
3697
3698 idletime = reaperage - busytime;
3699 if (idletime > reaperage)
3700 idletime = reaperage;
3701
3702 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3703 if (total_len < 0){
3704 perror("Error writing to cache");
3705 return 0;
3706 }
3707
3708 d->size = (int)total_len;
3709 d->cached = 1;
3710
3711 if (total_len > size) total_len = size;
3712
3713 memcpy(buf, d->buf, total_len);
3714 return total_len;
3715}
3716
3717static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3718 struct fuse_file_info *fi)
3719{
3720 char dev_name[72];
3721 struct fuse_context *fc = fuse_get_context();
3722 struct file_info *d = (struct file_info *)fi->fh;
3723 char *cg;
3724 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3725 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3726 unsigned long read = 0, write = 0;
3727 unsigned long read_merged = 0, write_merged = 0;
3728 unsigned long read_sectors = 0, write_sectors = 0;
3729 unsigned long read_ticks = 0, write_ticks = 0;
3730 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3731 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3732 char *cache = d->buf;
3733 size_t cache_size = d->buflen;
3734 char *line = NULL;
3735 size_t linelen = 0, total_len = 0, rv = 0;
3736 unsigned int major = 0, minor = 0;
3737 int i = 0;
3738 FILE *f = NULL;
3739
3740 if (offset){
3741 if (offset > d->size)
3742 return -EINVAL;
3743 if (!d->cached)
3744 return 0;
3745 int left = d->size - offset;
3746 total_len = left > size ? size: left;
3747 memcpy(buf, cache + offset, total_len);
3748 return total_len;
3749 }
3750
3751 pid_t initpid = lookup_initpid_in_store(fc->pid);
3752 if (initpid <= 0)
3753 initpid = fc->pid;
3754 cg = get_pid_cgroup(initpid, "blkio");
3755 if (!cg)
3756 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 3757 prune_init_slice(cg);
237e200e 3758
2209fe50 3759 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 3760 goto err;
2209fe50 3761 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 3762 goto err;
2209fe50 3763 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 3764 goto err;
2209fe50 3765 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 3766 goto err;
2209fe50 3767 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
3768 goto err;
3769
3770
3771 f = fopen("/proc/diskstats", "r");
3772 if (!f)
3773 goto err;
3774
3775 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3776 ssize_t l;
2209fe50 3777 char lbuf[256];
237e200e
SH
3778
3779 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 3780 if (i != 3)
237e200e 3781 continue;
2209fe50
SH
3782
3783 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3784 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3785 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3786 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3787 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3788 read_sectors = read_sectors/512;
3789 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3790 write_sectors = write_sectors/512;
3791
3792 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3793 rd_svctm = rd_svctm/1000000;
3794 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3795 rd_wait = rd_wait/1000000;
3796 read_ticks = rd_svctm + rd_wait;
3797
3798 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3799 wr_svctm = wr_svctm/1000000;
3800 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3801 wr_wait = wr_wait/1000000;
3802 write_ticks = wr_svctm + wr_wait;
3803
3804 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3805 tot_ticks = tot_ticks/1000000;
237e200e
SH
3806
3807 memset(lbuf, 0, 256);
2db31eb6
SH
3808 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3809 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3810 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3811 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3812 else
3813 continue;
237e200e 3814
2209fe50 3815 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
3816 if (l < 0) {
3817 perror("Error writing to fuse buf");
3818 rv = 0;
3819 goto err;
3820 }
3821 if (l >= cache_size) {
3822 fprintf(stderr, "Internal error: truncated write to cache\n");
3823 rv = 0;
3824 goto err;
3825 }
3826 cache += l;
3827 cache_size -= l;
3828 total_len += l;
3829 }
3830
3831 d->cached = 1;
3832 d->size = total_len;
3833 if (total_len > size ) total_len = size;
3834 memcpy(buf, d->buf, total_len);
3835
3836 rv = total_len;
3837err:
3838 free(cg);
3839 if (f)
3840 fclose(f);
3841 free(line);
3842 free(io_serviced_str);
3843 free(io_merged_str);
3844 free(io_service_bytes_str);
3845 free(io_wait_time_str);
3846 free(io_service_time_str);
3847 return rv;
3848}
3849
70dcc12e
SH
3850static int proc_swaps_read(char *buf, size_t size, off_t offset,
3851 struct fuse_file_info *fi)
3852{
3853 struct fuse_context *fc = fuse_get_context();
3854 struct file_info *d = (struct file_info *)fi->fh;
3855 char *cg = NULL;
3856 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3857 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3858 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
3859 ssize_t total_len = 0, rv = 0;
3860 ssize_t l = 0;
70dcc12e
SH
3861 char *cache = d->buf;
3862
3863 if (offset) {
3864 if (offset > d->size)
3865 return -EINVAL;
3866 if (!d->cached)
3867 return 0;
3868 int left = d->size - offset;
3869 total_len = left > size ? size: left;
3870 memcpy(buf, cache + offset, total_len);
3871 return total_len;
3872 }
3873
3874 pid_t initpid = lookup_initpid_in_store(fc->pid);
3875 if (initpid <= 0)
3876 initpid = fc->pid;
3877 cg = get_pid_cgroup(initpid, "memory");
3878 if (!cg)
3879 return read_file("/proc/swaps", buf, size, d);
6d2f6996 3880 prune_init_slice(cg);
70dcc12e
SH
3881
3882 if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3883 goto err;
3884
3885 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3886 goto err;
3887
3888 memlimit = strtoul(memlimit_str, NULL, 10);
3889 memusage = strtoul(memusage_str, NULL, 10);
3890
3891 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3892 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3893
3894 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3895 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3896 goto err;
3897 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3898 goto err;
3899
3900 memswlimit = strtoul(memswlimit_str, NULL, 10);
3901 memswusage = strtoul(memswusage_str, NULL, 10);
3902
3903 if (!strcmp(memswlimit_str, memswlimit_default_str))
3904 memswlimit = 0;
3905 if (!strcmp(memswusage_str, memswusage_default_str))
3906 memswusage = 0;
3907
3908 swap_total = (memswlimit - memlimit) / 1024;
3909 swap_free = (memswusage - memusage) / 1024;
3910 }
3911
3912 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3913
3914 /* When no mem + swap limit is specified or swapaccount=0*/
3915 if (!memswlimit) {
3916 char *line = NULL;
3917 size_t linelen = 0;
3918 FILE *f = fopen("/proc/meminfo", "r");
3919
3920 if (!f)
3921 goto err;
3922
3923 while (getline(&line, &linelen, f) != -1) {
3924 if (startswith(line, "SwapTotal:")) {
3925 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
3926 } else if (startswith(line, "SwapFree:")) {
3927 sscanf(line, "SwapFree: %8lu kB", &swap_free);
3928 }
3929 }
3930
3931 free(line);
3932 fclose(f);
3933 }
3934
3935 if (swap_total > 0) {
a262ddb7
CB
3936 l = snprintf(d->buf + total_len, d->size - total_len,
3937 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3938 swap_total, swap_free);
3939 total_len += l;
70dcc12e
SH
3940 }
3941
a262ddb7 3942 if (total_len < 0 || l < 0) {
70dcc12e
SH
3943 perror("Error writing to cache");
3944 rv = 0;
3945 goto err;
3946 }
3947
3948 d->cached = 1;
3949 d->size = (int)total_len;
3950
3951 if (total_len > size) total_len = size;
3952 memcpy(buf, d->buf, total_len);
3953 rv = total_len;
3954
3955err:
3956 free(cg);
3957 free(memswlimit_str);
3958 free(memlimit_str);
3959 free(memusage_str);
3960 free(memswusage_str);
3961 free(memswusage_default_str);
3962 free(memswlimit_default_str);
3963 return rv;
3964}
3965
237e200e
SH
/*
 * Return the number of bytes in the file @which, determined by reading it
 * line by line and summing the lengths getline() reports.
 * Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp;
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	free(linebuf);
	fclose(fp);

	return total;
}
3982
/*
 * Fill @sb for a path below the virtual /proc tree.
 *
 * "/proc" is reported as a 0555 directory with two links; each of the
 * virtualised files is a 0444 regular file of size 0 with one link.  All
 * three timestamps are set to the current CLOCK_REALTIME, owner and group
 * to 0.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_gid = 0;
	sb->st_uid = 0;
	sb->st_ctim = now;
	sb->st_mtim = now;
	sb->st_atim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(regular_files) / sizeof(regular_files[0]); i++) {
		if (strcmp(path, regular_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4011
4012int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4013 struct fuse_file_info *fi)
4014{
4015 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
4016 filler(buf, "meminfo", NULL, 0) != 0 ||
4017 filler(buf, "stat", NULL, 0) != 0 ||
4018 filler(buf, "uptime", NULL, 0) != 0 ||
70dcc12e
SH
4019 filler(buf, "diskstats", NULL, 0) != 0 ||
4020 filler(buf, "swaps", NULL, 0) != 0)
237e200e
SH
4021 return -EINVAL;
4022 return 0;
4023}
4024
4025int proc_open(const char *path, struct fuse_file_info *fi)
4026{
4027 int type = -1;
4028 struct file_info *info;
4029
4030 if (strcmp(path, "/proc/meminfo") == 0)
4031 type = LXC_TYPE_PROC_MEMINFO;
4032 else if (strcmp(path, "/proc/cpuinfo") == 0)
4033 type = LXC_TYPE_PROC_CPUINFO;
4034 else if (strcmp(path, "/proc/uptime") == 0)
4035 type = LXC_TYPE_PROC_UPTIME;
4036 else if (strcmp(path, "/proc/stat") == 0)
4037 type = LXC_TYPE_PROC_STAT;
4038 else if (strcmp(path, "/proc/diskstats") == 0)
4039 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
4040 else if (strcmp(path, "/proc/swaps") == 0)
4041 type = LXC_TYPE_PROC_SWAPS;
237e200e
SH
4042 if (type == -1)
4043 return -ENOENT;
4044
4045 info = malloc(sizeof(*info));
4046 if (!info)
4047 return -ENOMEM;
4048
4049 memset(info, 0, sizeof(*info));
4050 info->type = type;
4051
4052 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4053 do {
4054 info->buf = malloc(info->buflen);
4055 } while (!info->buf);
4056 memset(info->buf, 0, info->buflen);
4057 /* set actual size to buffer size */
4058 info->size = info->buflen;
4059
4060 fi->fh = (unsigned long)info;
4061 return 0;
4062}
4063
bddbb106
SH
/*
 * Access check for the virtual /proc files, which are all read-only:
 * any bit in @mask other than R_OK yields -EACCES, otherwise 0.
 * @path is not inspected.
 */
int proc_access(const char *path, int mask)
{
	return (mask & ~R_OK) ? -EACCES : 0;
}
4071
237e200e
SH
/*
 * Release a handle on a virtual /proc file by handing @fi to
 * do_release_file_info().  @path is unused.  Always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4077
4078int proc_read(const char *path, char *buf, size_t size, off_t offset,
4079 struct fuse_file_info *fi)
4080{
4081 struct file_info *f = (struct file_info *) fi->fh;
4082
4083 switch (f->type) {
4084 case LXC_TYPE_PROC_MEMINFO:
4085 return proc_meminfo_read(buf, size, offset, fi);
4086 case LXC_TYPE_PROC_CPUINFO:
4087 return proc_cpuinfo_read(buf, size, offset, fi);
4088 case LXC_TYPE_PROC_UPTIME:
4089 return proc_uptime_read(buf, size, offset, fi);
4090 case LXC_TYPE_PROC_STAT:
4091 return proc_stat_read(buf, size, offset, fi);
4092 case LXC_TYPE_PROC_DISKSTATS:
4093 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
4094 case LXC_TYPE_PROC_SWAPS:
4095 return proc_swaps_read(buf, size, offset, fi);
237e200e
SH
4096 default:
4097 return -EINVAL;
4098 }
4099}
4100
29a73c2f
CB
4101/*
4102 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
4103 */
4104
/*
 * Create directory @dir and any missing parent directories, each with
 * @mode (like "mkdir -p").
 *
 * The path is walked one component at a time and every prefix is passed to
 * mkdir(); components that already exist (EEXIST) are accepted.  For a
 * relative path the first prefix is the empty string, which is skipped
 * rather than passed to mkdir() (that call would fail with ENOENT).
 *
 * Returns true on success, false on allocation failure or when a component
 * cannot be created (a message is printed to stderr in the latter case).
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* dir: start of the next component; tmp: just past its end. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			fprintf(stderr, "failed to create directory '%s': %s\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
4128
4129static bool umount_if_mounted(void)
4130{
4131 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
480262c9 4132 fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
4133 return false;
4134 }
4135 return true;
4136}
4137
4138static int pivot_enter(void)
4139{
4140 int ret = -1, oldroot = -1, newroot = -1;
4141
4142 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4143 if (oldroot < 0) {
4144 fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__);
4145 return ret;
4146 }
4147
4148 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4149 if (newroot < 0) {
4150 fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__);
4151 goto err;
4152 }
4153
4154 /* change into new root fs */
4155 if (fchdir(newroot) < 0) {
4156 fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR);
4157 goto err;
4158 }
4159
4160 /* pivot_root into our new root fs */
4161 if (pivot_root(".", ".") < 0) {
4162 fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno));
4163 goto err;
4164 }
4165
4166 /*
4167 * At this point the old-root is mounted on top of our new-root.
4168 * To unmounted it we must not be chdir'd into it, so escape back
4169 * to the old-root.
4170 */
4171 if (fchdir(oldroot) < 0) {
4172 fprintf(stderr, "%s: Failed to enter old root.\n", __func__);
4173 goto err;
4174 }
4175 if (umount2(".", MNT_DETACH) < 0) {
4176 fprintf(stderr, "%s: Failed to detach old root.\n", __func__);
4177 goto err;
4178 }
4179
4180 if (fchdir(newroot) < 0) {
4181 fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__);
4182 goto err;
4183 }
4184
4185 ret = 0;
4186
4187err:
4188 if (oldroot > 0)
4189 close(oldroot);
4190 if (newroot > 0)
4191 close(newroot);
4192 return ret;
4193}
4194
4195/* Prepare our new clean root. */
4196static int pivot_prepare(void)
4197{
4198 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4199 fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__);
4200 return -1;
4201 }
4202
4203 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4204 fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno));
4205 return -1;
4206 }
4207
4208 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4209 fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno));
4210 return -1;
4211 }
4212
4213 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4214 printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno));
4215 return -1;
4216 }
4217
4218 return 0;
4219}
4220
/*
 * Build the new root with pivot_prepare() and, only if that succeeded,
 * switch into it with pivot_enter().  Returns true when both steps
 * succeed.
 */
static bool pivot_new_root(void)
{
	return pivot_prepare() >= 0 && pivot_enter() >= 0;
}
4233
4234static bool setup_cgfs_dir(void)
4235{
4236 if (!mkdir_p(BASEDIR, 0700)) {
480262c9 4237 fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n");
29a73c2f
CB
4238 return false;
4239 }
480262c9 4240
29a73c2f 4241 if (!umount_if_mounted()) {
480262c9
CB
4242 fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n");
4243 return false;
4244 }
4245
4246 if (unshare(CLONE_NEWNS) < 0) {
4247 fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno));
4248 return false;
4249 }
4250
4251 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4252 fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno));
29a73c2f
CB
4253 return false;
4254 }
480262c9 4255
29a73c2f 4256 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
480262c9 4257 fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n");
29a73c2f
CB
4258 return false;
4259 }
480262c9 4260
29a73c2f
CB
4261 return true;
4262}
4263
4264static bool do_mount_cgroups(void)
4265{
4266 char *target;
4267 size_t clen, len;
4268 int i, ret;
4269
4270 for (i = 0; i < num_hierarchies; i++) {
4271 char *controller = hierarchies[i];
4272 clen = strlen(controller);
4273 len = strlen(BASEDIR) + clen + 2;
4274 target = malloc(len);
4275 if (!target)
4276 return false;
4277 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4278 if (ret < 0 || ret >= len) {
4279 free(target);
4280 return false;
4281 }
4282 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4283 free(target);
4284 return false;
4285 }
4286 if (mount(controller, target, "cgroup", 0, controller) < 0) {
4287 fprintf(stderr, "Failed mounting cgroup %s\n", controller);
4288 free(target);
4289 return false;
4290 }
4291
4292 fd_hierarchies[i] = open(target, O_DIRECTORY);
4293 if (fd_hierarchies[i] < 0) {
4294 free(target);
4295 return false;
4296 }
4297 free(target);
4298 }
4299 return true;
4300}
4301
480262c9 4302static bool cgfs_setup_controllers(void)
29a73c2f 4303{
480262c9 4304 if (!setup_cgfs_dir())
29a73c2f 4305 return false;
29a73c2f
CB
4306
4307 if (!do_mount_cgroups()) {
480262c9 4308 fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n");
29a73c2f
CB
4309 return false;
4310 }
4311
4312 if (!pivot_new_root())
4313 return false;
4314
4315 return true;
4316}
4317
480262c9
CB
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC and return the
 * descriptor.  Returns -1 if the path cannot be formatted, otherwise
 * whatever open() returns.
 */
static int preserve_ns(int pid)
{
	/* "/proc" + "/<int as string>" + "/ns/mnt" + NUL */
	char path[5 + 21 + 7 + 1];
	int n;

	n = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (n < 0 || (size_t)n >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
4330
29a73c2f 4331static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
4332{
4333 FILE *f;
4334 char *line = NULL;
4335 size_t len = 0;
480262c9 4336 int i, init_ns = -1;
237e200e
SH
4337
4338 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4339 fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
4340 return;
4341 }
4342 while (getline(&line, &len, f) != -1) {
4343 char *p, *p2;
4344
4345 p = strchr(line, ':');
4346 if (!p)
4347 goto out;
4348 *(p++) = '\0';
4349
4350 p2 = strrchr(p, ':');
4351 if (!p2)
4352 goto out;
4353 *p2 = '\0';
4354
a67719f6
CB
4355 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4356 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4357 * because it parses out the empty string "" and later on passes
4358 * it to mount(). Let's skip such entries.
4359 */
4360 if (!strcmp(p, ""))
4361 continue;
4362
237e200e
SH
4363 if (!store_hierarchy(line, p))
4364 goto out;
4365 }
4366
480262c9
CB
4367 /* Preserve initial namespace. */
4368 init_ns = preserve_ns(getpid());
4369 if (init_ns < 0)
4370 goto out;
4371
4372 fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
29a73c2f
CB
4373 if (!fd_hierarchies)
4374 goto out;
4375
480262c9
CB
4376 for (i = 0; i < num_hierarchies; i++)
4377 fd_hierarchies[i] = -1;
4378
4379 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4380 * to privately mount lxcfs cgroups. */
4381 if (!cgfs_setup_controllers())
29a73c2f 4382 goto out;
480262c9
CB
4383
4384 if (setns(init_ns, 0) < 0)
29a73c2f 4385 goto out;
29a73c2f 4386
237e200e
SH
4387 print_subsystems();
4388
4389out:
4390 free(line);
4391 fclose(f);
480262c9
CB
4392 if (init_ns >= 0)
4393 close(init_ns);
237e200e
SH
4394}
4395
4396static void __attribute__((destructor)) free_subsystems(void)
4397{
4398 int i;
4399
29a73c2f 4400 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
4401 if (hierarchies[i])
4402 free(hierarchies[i]);
480262c9 4403 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
4404 close(fd_hierarchies[i]);
4405 }
237e200e 4406 free(hierarchies);
480262c9 4407 free(fd_hierarchies);
237e200e 4408}