]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #242 from aither64/loadavg-reload
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
/* The largest 64 bit integer, 2^64 - 1, has 20 decimal digits; 21 leaves one extra byte (presumably for the terminating NUL). */
43 #define LXCFS_NUMSTRLEN64 21
44
/*
 * pivot_root() availability shim.
 *
 * When the build reports that the C library provides pivot_root()
 * (HAVE_PIVOT_ROOT), only declare it. Otherwise define a local version that
 * issues the raw syscall, or fails with ENOSYS when the syscall number is not
 * known at compile time.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
/*
 * Type tags for the kinds of entries handled in this file: cgroup
 * directories, cgroup files and the individual /proc files.
 * NOTE(review): presumably these are the values stored in file_info.type -
 * confirm against the users of that field.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/*
 * Bookkeeping for one file or directory entry: which cgroup object it refers
 * to and the buffer holding its contents.
 * NOTE(review): the exact lifetime of this structure is not visible in this
 * part of the file - confirm against the open/release handlers.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type; /* presumably one of the LXC_TYPE_* values - TODO confirm */
	char *buf; // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
82
/* Parameters of the load average hash table. */
#define LOAD_SIZE 100 /*the size of hash_table */
#define FLUSH_TIME 5 /*the flush rate */
#define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point constants and helpers used to calculate the load average. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
/* Integer part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
/* First two decimal digits of the fractional part of a fixed-point value. */
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* NOTE(review): presumably set to ask the loadavg refresh loop to stop - confirm against its users. */
static volatile sig_atomic_t loadavg_stop = 0;
101 static int calc_hash(char *name)
102 {
103 unsigned int hash = 0;
104 unsigned int x = 0;
105 /* ELFHash algorithm. */
106 while (*name) {
107 hash = (hash << 4) + *name++;
108 x = hash & 0xf0000000;
109 if (x != 0)
110 hash ^= (x >> 24);
111 hash &= ~x;
112 }
113 return ((hash & 0x7fffffff) % LOAD_SIZE);
114 }
115
/* One entry of a load average hash bucket; entries form a doubly linked list. */
struct load_node {
	char *cg; /* cgroup name; heap allocated, freed together with the node */
	unsigned long avenrun[3]; /* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* following node in the bucket, or NULL */
	struct load_node **pre; /* address of the pointer that points at this node
				 * (the bucket head or the previous node's next) */
};
126
/* Head of one hash bucket: the list of load_nodes plus the locks guarding it. */
struct load_head {
	/*
	 * The lock covers inserting and refreshing load_nodes. For the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket
	 * are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock covers reading loadavg and deleting load_nodes. Within each
	 * hash bucket, read and delete are mutually exclusive, but parallel read
	 * operations are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock covers reading loadavg and inserting load_nodes. For the
	 * first load_node of each hash bucket, read and insert are mutually
	 * exclusive, but parallel read operations are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of the bucket, or NULL when empty */
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
150 /*
151 * init_load initialize the hash table.
152 * Return 0 on success, return -1 on failure.
153 */
154 static int init_load(void)
155 {
156 int i;
157 int ret;
158
159 for (i = 0; i < LOAD_SIZE; i++) {
160 load_hash[i].next = NULL;
161 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
162 if (ret != 0) {
163 lxcfs_error("%s\n", "Failed to initialize lock");
164 goto out3;
165 }
166 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize rdlock");
169 goto out2;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rilock");
174 goto out1;
175 }
176 }
177 return 0;
178 out1:
179 pthread_rwlock_destroy(&load_hash[i].rdlock);
180 out2:
181 pthread_mutex_destroy(&load_hash[i].lock);
182 out3:
183 while (i > 0) {
184 i--;
185 pthread_mutex_destroy(&load_hash[i].lock);
186 pthread_rwlock_destroy(&load_hash[i].rdlock);
187 pthread_rwlock_destroy(&load_hash[i].rilock);
188 }
189 return -1;
190 }
191
192 static void insert_node(struct load_node **n, int locate)
193 {
194 struct load_node *f;
195
196 pthread_mutex_lock(&load_hash[locate].lock);
197 pthread_rwlock_wrlock(&load_hash[locate].rilock);
198 f = load_hash[locate].next;
199 load_hash[locate].next = *n;
200
201 (*n)->pre = &(load_hash[locate].next);
202 if (f)
203 f->pre = &((*n)->next);
204 (*n)->next = f;
205 pthread_mutex_unlock(&load_hash[locate].lock);
206 pthread_rwlock_unlock(&load_hash[locate].rilock);
207 }
208 /*
209 * locate_node() finds special node. Not return NULL means success.
210 * It should be noted that rdlock isn't unlocked at the end of code
211 * because this function is used to read special node. Delete is not
212 * allowed before read has ended.
213 * unlock rdlock only in proc_loadavg_read().
214 */
215 static struct load_node *locate_node(char *cg, int locate)
216 {
217 struct load_node *f = NULL;
218 int i = 0;
219
220 pthread_rwlock_rdlock(&load_hash[locate].rilock);
221 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
222 if (load_hash[locate].next == NULL) {
223 pthread_rwlock_unlock(&load_hash[locate].rilock);
224 return f;
225 }
226 f = load_hash[locate].next;
227 pthread_rwlock_unlock(&load_hash[locate].rilock);
228 while (f && ((i = strcmp(f->cg, cg)) != 0))
229 f = f->next;
230 return f;
231 }
232 /* Delete the load_node n and return the next node of it. */
233 static struct load_node *del_node(struct load_node *n, int locate)
234 {
235 struct load_node *g;
236
237 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
238 if (n->next == NULL) {
239 *(n->pre) = NULL;
240 } else {
241 *(n->pre) = n->next;
242 n->next->pre = n->pre;
243 }
244 g = n->next;
245 free(n->cg);
246 free(n);
247 pthread_rwlock_unlock(&load_hash[locate].rdlock);
248 return g;
249 }
250
251 static void load_free(void)
252 {
253 int i;
254 struct load_node *f, *p;
255
256 for (i = 0; i < LOAD_SIZE; i++) {
257 pthread_mutex_lock(&load_hash[i].lock);
258 pthread_rwlock_wrlock(&load_hash[i].rilock);
259 pthread_rwlock_wrlock(&load_hash[i].rdlock);
260 if (load_hash[i].next == NULL) {
261 pthread_mutex_unlock(&load_hash[i].lock);
262 pthread_mutex_destroy(&load_hash[i].lock);
263 pthread_rwlock_unlock(&load_hash[i].rilock);
264 pthread_rwlock_destroy(&load_hash[i].rilock);
265 pthread_rwlock_unlock(&load_hash[i].rdlock);
266 pthread_rwlock_destroy(&load_hash[i].rdlock);
267 continue;
268 }
269 for (f = load_hash[i].next; f; ) {
270 free(f->cg);
271 p = f->next;
272 free(f);
273 f = p;
274 }
275 pthread_mutex_unlock(&load_hash[i].lock);
276 pthread_mutex_destroy(&load_hash[i].lock);
277 pthread_rwlock_unlock(&load_hash[i].rilock);
278 pthread_rwlock_destroy(&load_hash[i].rilock);
279 pthread_rwlock_unlock(&load_hash[i].rdlock);
280 pthread_rwlock_destroy(&load_hash[i].rdlock);
281 }
282 }
283 /* Reserve buffer size to account for file size changes. */
284 #define BUF_RESERVE_SIZE 512
285
286 /*
287 * A table caching which pid is init for a pid namespace.
288 * When looking up which pid is init for $qpid, we first
289 * 1. Stat /proc/$qpid/ns/pid.
290 * 2. Check whether the ino_t is in our store.
291 * a. if not, fork a child in qpid's ns to send us
292 * ucred.pid = 1, and read the initpid. Cache
293 * initpid and creation time for /proc/initpid
294 * in a new store entry.
295 * b. if so, verify that /proc/initpid still matches
296 * what we have saved. If not, clear the store
297 * entry and go back to a. If so, return the
298 * cached initpid.
299 */
/* One cached "pid namespace -> init pid" mapping; entries are chained per hash bucket. */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // time(NULL) when the entry was stored or last validated
};
307
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for a key (used with the namespace inode number). */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Chained hash table of cached init pids, indexed by HASH(inode number). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Protects pidns_hash_table; taken and released via store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock mutex @l. A locking failure is treated as fatal: the error is logged
 * and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int rc = pthread_mutex_lock(l);

	if (rc == 0)
		return;

	lxcfs_error("returned:%d %s\n", rc, strerror(rc));
	exit(1);
}
323
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* NOTE(review): presumably the fd of the private mount namespace; -1 until set - confirm. */
static int cgroup_mount_ns_fd = -1;
344
/*
 * Unlock mutex @l. An unlocking failure is treated as fatal: the error is
 * logged and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int rc = pthread_mutex_unlock(l);

	if (rc == 0)
		return;

	lxcfs_error("returned:%d %s\n", rc, strerror(rc));
	exit(1);
}
354
/* Take the mutex protecting the pidns init store; exits the process on failure (see lock_mutex()). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
359
/* Release the mutex protecting the pidns init store; exits the process on failure (see unlock_mutex()). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
364
365 /* Must be called under store_lock */
366 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
367 {
368 struct stat initsb;
369 char fnam[100];
370
371 snprintf(fnam, 100, "/proc/%d", e->initpid);
372 if (stat(fnam, &initsb) < 0)
373 return false;
374
375 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
376 initsb.st_ctime, e->initpid);
377
378 if (e->ctime != initsb.st_ctime)
379 return false;
380 return true;
381 }
382
383 /* Must be called under store_lock */
384 static void remove_initpid(struct pidns_init_store *e)
385 {
386 struct pidns_init_store *tmp;
387 int h;
388
389 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
390
391 h = HASH(e->ino);
392 if (pidns_hash_table[h] == e) {
393 pidns_hash_table[h] = e->next;
394 free(e);
395 return;
396 }
397
398 tmp = pidns_hash_table[h];
399 while (tmp) {
400 if (tmp->next == e) {
401 tmp->next = e->next;
402 free(e);
403 return;
404 }
405 tmp = tmp->next;
406 }
407 }
408
409 #define PURGE_SECS 5
410 /* Must be called under store_lock */
411 static void prune_initpid_store(void)
412 {
413 static long int last_prune = 0;
414 struct pidns_init_store *e, *prev, *delme;
415 long int now, threshold;
416 int i;
417
418 if (!last_prune) {
419 last_prune = time(NULL);
420 return;
421 }
422 now = time(NULL);
423 if (now < last_prune + PURGE_SECS)
424 return;
425
426 lxcfs_debug("%s\n", "Pruning.");
427
428 last_prune = now;
429 threshold = now - 2 * PURGE_SECS;
430
431 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
432 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
433 if (e->lastcheck < threshold) {
434
435 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
436
437 delme = e;
438 if (prev)
439 prev->next = e->next;
440 else
441 pidns_hash_table[i] = e->next;
442 e = e->next;
443 free(delme);
444 } else {
445 prev = e;
446 e = e->next;
447 }
448 }
449 }
450 }
451
452 /* Must be called under store_lock */
453 static void save_initpid(struct stat *sb, pid_t pid)
454 {
455 struct pidns_init_store *e;
456 char fpath[100];
457 struct stat procsb;
458 int h;
459
460 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
461
462 snprintf(fpath, 100, "/proc/%d", pid);
463 if (stat(fpath, &procsb) < 0)
464 return;
465 do {
466 e = malloc(sizeof(*e));
467 } while (!e);
468 e->ino = sb->st_ino;
469 e->initpid = pid;
470 e->ctime = procsb.st_ctime;
471 h = HASH(e->ino);
472 e->next = pidns_hash_table[h];
473 e->lastcheck = time(NULL);
474 pidns_hash_table[h] = e;
475 }
476
477 /*
478 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
479 * entry for the inode number and creation time. Verify that the init pid
480 * is still valid. If not, remove it. Return the entry if valid, NULL
481 * otherwise.
482 * Must be called under store_lock
483 */
484 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
485 {
486 int h = HASH(sb->st_ino);
487 struct pidns_init_store *e = pidns_hash_table[h];
488
489 while (e) {
490 if (e->ino == sb->st_ino) {
491 if (initpid_still_valid(e, sb)) {
492 e->lastcheck = time(NULL);
493 return e;
494 }
495 remove_initpid(e);
496 return NULL;
497 }
498 e = e->next;
499 }
500
501 return NULL;
502 }
503
/*
 * Check whether @path, resolved relative to the directory file descriptor
 * @fd, is a directory.
 *
 * Returns 1 if the entry can be stat'ed and is a directory, 0 otherwise
 * (including when the stat fails).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;

	/*
	 * The last argument of fstatat() is a flags word, not a descriptor.
	 * Passing the fd there made the call fail with EINVAL for ordinary
	 * descriptor values, so directories were never recognized.
	 */
	if (fstatat(fd, path, &statbuf, 0) != 0)
		return 0;

	return S_ISDIR(statbuf.st_mode) ? 1 : 0;
}
512
/*
 * Duplicate @str on the heap, retrying until the allocation succeeds.
 * Returns NULL only when @str itself is NULL. The caller frees the result.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	for (;;) {
		copy = strdup(str);
		if (copy != NULL)
			return copy;
	}
}
524
/* Strip every '\n' at the end of @s in place; other characters are untouched. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		s[end - 1] = '\0';
		end--;
	}
}
532
#define BATCH_SIZE 50
/*
 * Make sure the buffer *mem can hold @newlen bytes, growing it in steps of
 * BATCH_SIZE bytes.
 *
 * The buffer is (re)allocated only when it does not exist yet or when
 * @newlen falls into a later batch than @oldlen; the allocation is retried
 * until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int wanted = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem != NULL && wanted <= have)
		return;

	do {
		grown = realloc(*mem, wanted * BATCH_SIZE);
	} while (grown == NULL);

	*mem = grown;
}
/*
 * Append @line (@linelen bytes plus its terminating NUL) to the growing
 * buffer *contents, whose current length is *len. The buffer is enlarged as
 * needed and *len is advanced by @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* Copy one byte more than linelen so the result stays NUL-terminated. */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
554
/*
 * Read everything from the open descriptor @fd into one heap string and strip
 * the trailing newlines.
 *
 * @from: name of the file (not used by this function)
 * @fd:   descriptor to read from; it is always consumed - closed here on
 *        success and on failure - so the caller must not close it again.
 *
 * Returns the contents (to be freed by the caller), or NULL when the stream
 * cannot be set up or when nothing could be read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/* fdopen() failed, so no stream owns fd: close it to avoid a leak. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
576
/*
 * Write @string to the open descriptor @fd.
 *
 * @fnam:   name of the file (not used by this function)
 * @string: NUL-terminated data to write
 * @fd:     descriptor to write to; it is always consumed - closed here on
 *          success and on failure - so the caller must not close it again.
 *
 * Returns true when all bytes were written and the stream closed cleanly.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() failed, so no stream owns fd: close it to avoid a leak. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the data did not get written. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}

	return true;
}
597
/* Name, ownership and mode of one cgroup file or directory (filled in by cgfs_get_key()). */
struct cgfs_files {
	char *name; /* heap allocated; released by free_key() */
	uint32_t uid, gid;
	uint32_t mode;
};
603
604 #define ALLOC_NUM 20
605 static bool store_hierarchy(char *stridx, char *h)
606 {
607 if (num_hierarchies % ALLOC_NUM == 0) {
608 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
609 n *= ALLOC_NUM;
610 char **tmp = realloc(hierarchies, n * sizeof(char *));
611 if (!tmp) {
612 lxcfs_error("%s\n", strerror(errno));
613 exit(1);
614 }
615 hierarchies = tmp;
616 }
617
618 hierarchies[num_hierarchies++] = must_copy_string(h);
619 return true;
620 }
621
622 static void print_subsystems(void)
623 {
624 int i;
625
626 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
627 fprintf(stderr, "hierarchies:\n");
628 for (i = 0; i < num_hierarchies; i++) {
629 if (hierarchies[i])
630 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
631 fd_hierarchies[i], hierarchies[i]);
632 }
633 }
634
/*
 * Return true if @needle is exactly one of the comma-separated elements of
 * @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t needle_len = strlen(needle);
	const char *tok = haystack;
	const char *comma;

	/* Every element that is followed by a comma ... */
	for (; *tok != '\0' && (comma = strchr(tok, ',')) != NULL; tok = comma + 1) {
		if ((size_t)(comma - tok) == needle_len &&
		    strncmp(needle, tok, needle_len) == 0)
			return true;
	}

	/* ... and finally whatever remains after the last comma. */
	return strcmp(needle, tok) == 0;
}
653
654 /* do we need to do any massaging here? I'm not sure... */
655 /* Return the mounted controller and store the corresponding open file descriptor
656 * referring to the controller mountpoint in the private lxcfs namespace in
657 * @cfd.
658 */
659 static char *find_mounted_controller(const char *controller, int *cfd)
660 {
661 int i;
662
663 for (i = 0; i < num_hierarchies; i++) {
664 if (!hierarchies[i])
665 continue;
666 if (strcmp(hierarchies[i], controller) == 0) {
667 *cfd = fd_hierarchies[i];
668 return hierarchies[i];
669 }
670 if (in_comma_list(controller, hierarchies[i])) {
671 *cfd = fd_hierarchies[i];
672 return hierarchies[i];
673 }
674 }
675
676 return NULL;
677 }
678
/*
 * Write @value into the file @file of cgroup @cgroup under @controller.
 * Returns true on success; false if the controller is not mounted, the file
 * cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int cfd, fd, n;
	size_t pathlen;
	char *path;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", cgroup[0] == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
705
/*
 * Chown all the files in the cgroup directory. We do this when we create
 * a cgroup on behalf of a user.
 *
 * @dirname: path of the directory, relative to @fd
 * @uid, @gid: new owner for every entry in the directory
 * @fd: directory file descriptor that @dirname is relative to
 *
 * Best effort: failures are logged and the remaining entries are still
 * processed.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* No directory stream took ownership of fd1: close it to avoid a leak. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}

	/* closedir() also closes fd1. */
	closedir(d);
}
745
/*
 * Create the cgroup @cg under @controller with mode 0755. Unless both @uid
 * and @gid are 0, the new directory and the files in it are chowned to
 * @uid:@gid.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or the
 * negated errno of the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t buflen;
	char *reldir;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	buflen = strlen(cg) + 2;
	reldir = alloca(buflen);
	snprintf(reldir, buflen, "%s%s", cg[0] == '/' ? "." : "", cg);

	if (mkdirat(cfd, reldir, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, reldir, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(reldir, uid, gid, cfd);
	}

	return 0;
}
776
/*
 * Remove the directory @dirname and, first, every directory below it.
 *
 * @dirname: path of the directory, relative to @cfd
 * @fd:      open descriptor referring to @dirname; it is only duplicated
 *           here and stays owned by the caller
 * @cfd:     directory file descriptor that all paths are relative to
 *
 * Only subdirectories are removed explicitly; other entries are skipped.
 * Failures in subdirectories are logged but do not stop the traversal.
 * Returns true if the directory stream was closed cleanly and @dirname
 * itself was removed.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, subfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		/*
		 * Recurse with a descriptor for the subdirectory itself. Handing
		 * down the parent's descriptor would make the nested call iterate
		 * the parent's entries (with the shared file offset) instead of
		 * the subdirectory's.
		 */
		subfd = openat(cfd, pathname, O_DIRECTORY);
		if (subfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!recursive_rmdir(pathname, subfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(subfd);
	}

	ret = true;
	/* closedir() also closes dupfd, so it must not be closed again below. */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
835
/*
 * Remove the cgroup @cg under @controller together with its child cgroups.
 * Returns true on success; false if the controller is not mounted, the
 * directory cannot be opened, or the removal fails.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd;
	size_t buflen;
	char *reldir;
	bool removed;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	buflen = strlen(cg) + 2;
	reldir = alloca(buflen);
	snprintf(reldir, buflen, "%s%s", cg[0] == '/' ? "." : "", cg);

	dirfd = openat(cfd, reldir, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(reldir, dirfd, cfd);
	close(dirfd);
	return removed;
}
862
/*
 * Change the mode of @file (a path inside the @controller hierarchy) to
 * @mode. Returns true on success, false if the controller is not mounted or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t buflen;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	buflen = strlen(file) + 2;
	relpath = alloca(buflen);
	snprintf(relpath, buflen, "%s%s", file[0] == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
883
/*
 * Chown the "tasks" and "cgroup.procs" files inside the cgroup directory
 * @dirname (relative to @fd) to @uid:@gid. Stops at the first failure.
 * Returns 0 on success or the negated errno of the failing fchownat().
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two file names. */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *target = alloca(buflen);

	snprintf(target, buflen, "%s/tasks", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	snprintf(target, buflen, "%s/cgroup.procs", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
899
/*
 * Chown @file (a path inside the @controller hierarchy) to @uid:@gid. If the
 * path is a directory, its tasks and cgroup.procs files are chowned as well.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or a
 * negated errno from the failing chown.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t buflen;
	char *relpath;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	buflen = strlen(file) + 2;
	relpath = alloca(buflen);
	snprintf(relpath, buflen, "%s%s", file[0] == '/' ? "." : "", file);

	if (fchownat(cfd, relpath, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(relpath, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(relpath, uid, gid, cfd);
}
925
/*
 * Open the cgroup.procs file of cgroup @cgroup under @controller for writing.
 *
 * Returns a stdio stream the caller must fclose(), or NULL if the controller
 * is not mounted or the file cannot be opened.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	/* fdopen() failed, so no stream owns fd: close it to avoid a leak. */
	if (!f)
		close(fd);
	return f;
}
949
950 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
951 void ***list, size_t typesize,
952 void* (*iterator)(const char*, const char*, const char*))
953 {
954 int cfd, fd, ret;
955 size_t len;
956 char *cg, *tmpc;
957 char pathname[MAXPATHLEN];
958 size_t sz = 0, asz = 0;
959 struct dirent *dirent;
960 DIR *dir;
961
962 tmpc = find_mounted_controller(controller, &cfd);
963 *list = NULL;
964 if (!tmpc)
965 return false;
966
967 /* Make sure we pass a relative path to *at() family of functions. */
968 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
969 cg = alloca(len);
970 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
971 if (ret < 0 || (size_t)ret >= len) {
972 lxcfs_error("Pathname too long under %s\n", cgroup);
973 return false;
974 }
975
976 fd = openat(cfd, cg, O_DIRECTORY);
977 if (fd < 0)
978 return false;
979
980 dir = fdopendir(fd);
981 if (!dir)
982 return false;
983
984 while ((dirent = readdir(dir))) {
985 struct stat mystat;
986
987 if (!strcmp(dirent->d_name, ".") ||
988 !strcmp(dirent->d_name, ".."))
989 continue;
990
991 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
992 if (ret < 0 || ret >= MAXPATHLEN) {
993 lxcfs_error("Pathname too long under %s\n", cg);
994 continue;
995 }
996
997 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
998 if (ret) {
999 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1000 continue;
1001 }
1002 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1003 (directories && !S_ISDIR(mystat.st_mode)))
1004 continue;
1005
1006 if (sz+2 >= asz) {
1007 void **tmp;
1008 asz += BATCH_SIZE;
1009 do {
1010 tmp = realloc(*list, asz * typesize);
1011 } while (!tmp);
1012 *list = tmp;
1013 }
1014 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1015 (*list)[sz+1] = NULL;
1016 sz++;
1017 }
1018 if (closedir(dir) < 0) {
1019 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1020 return false;
1021 }
1022 return true;
1023 }
1024
/*
 * Iterator callback producing one list element for a child cgroup: a heap
 * copy of @dir_entry. @controller and @cgroup are not used. The copy is
 * retried until the allocation succeeds, so NULL is never returned.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy = NULL;

	while (copy == NULL)
		copy = strdup(dir_entry);

	return copy;
}
1033
1034 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1035 {
1036 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1037 }
1038
1039 void free_key(struct cgfs_files *k)
1040 {
1041 if (!k)
1042 return;
1043 free(k->name);
1044 free(k);
1045 }
1046
/*
 * Free a NULL-terminated array of keys: every key up to the first NULL slot,
 * then the array itself. A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (keys == NULL)
		return;

	for (cursor = keys; *cursor != NULL; cursor++)
		free_key(*cursor);

	free(keys);
}
1058
/*
 * Read the file @file of cgroup @cgroup under @controller into a newly
 * allocated string stored in *value (to be freed by the caller).
 *
 * Returns true if contents were read; false if the controller is not
 * mounted, the file cannot be opened, or nothing could be read.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, n;
	size_t pathlen;
	char *path;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", cgroup[0] == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);
	return *value != NULL;
}
1085
1086 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1087 {
1088 int ret, cfd;
1089 size_t len;
1090 char *fnam, *tmpc;
1091 struct stat sb;
1092 struct cgfs_files *newkey;
1093
1094 tmpc = find_mounted_controller(controller, &cfd);
1095 if (!tmpc)
1096 return false;
1097
1098 if (file && *file == '/')
1099 file++;
1100
1101 if (file && strchr(file, '/'))
1102 return NULL;
1103
1104 /* Make sure we pass a relative path to *at() family of functions.
1105 * . + /cgroup + / + file + \0
1106 */
1107 len = strlen(cgroup) + 3;
1108 if (file)
1109 len += strlen(file) + 1;
1110 fnam = alloca(len);
1111 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1112 file ? "/" : "", file ? file : "");
1113
1114 ret = fstatat(cfd, fnam, &sb, 0);
1115 if (ret < 0)
1116 return NULL;
1117
1118 do {
1119 newkey = malloc(sizeof(struct cgfs_files));
1120 } while (!newkey);
1121 if (file)
1122 newkey->name = must_copy_string(file);
1123 else if (strrchr(cgroup, '/'))
1124 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1125 else
1126 newkey->name = must_copy_string(cgroup);
1127 newkey->uid = sb.st_uid;
1128 newkey->gid = sb.st_gid;
1129 newkey->mode = sb.st_mode;
1130
1131 return newkey;
1132 }
1133
/*
 * Iterator callback producing one list element for a cgroup file: the key
 * describing @dir_entry inside @cgroup. Logs an error and returns NULL when
 * the key cannot be obtained.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key == NULL)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return key;
}
1143
1144 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1145 {
1146 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1147 }
1148
/*
 * Return true if @f names a directory inside cgroup @cgroup under
 * @controller, i.e. a child cgroup. Returns false if the controller is not
 * mounted or the entry cannot be stat'ed.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t pathlen;
	char *path;
	struct stat sb;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", cgroup[0] == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode) ? true : false;
}
1176
/* Result codes of send_creds(); SEND_CREDS_OK is the success value. */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations of helpers used by the init pid lookup code below. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1184
/*
 * Clone a task which switches to @target's pid namespace and writes '1'
 * over a unix sock so we can read the task's reaper's pid in our
 * namespace.
 *
 * @sock:   socket over which the cloned child sends its credentials
 * @target: pid of the task whose pid namespace is entered
 *
 * Meant to run in a forked helper process: every error path, and the parent
 * side of the clone(), ends in _exit().
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* The cloned child gets a one page stack carved out of our own stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* clone() is handed the end of the stack region (stack + stack_size). */
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/*
		 * NOTE(review): a zero return from wait_for_pid() is treated as
		 * failure here - confirm this matches its return convention.
		 */
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1226
1227 static int send_creds_clone_wrapper(void *arg) {
1228 struct ucred cred;
1229 char v;
1230 int sock = *(int *)arg;
1231
1232 /* we are the child */
1233 cred.uid = 0;
1234 cred.gid = 0;
1235 cred.pid = 1;
1236 v = '1';
1237 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1238 return 1;
1239 return 0;
1240 }
1241
1242 static pid_t get_init_pid_for_task(pid_t task)
1243 {
1244 int sock[2];
1245 pid_t pid;
1246 pid_t ret = -1;
1247 char v = '0';
1248 struct ucred cred;
1249
1250 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1251 perror("socketpair");
1252 return -1;
1253 }
1254
1255 pid = fork();
1256 if (pid < 0)
1257 goto out;
1258 if (!pid) {
1259 close(sock[1]);
1260 write_task_init_pid_exit(sock[0], task);
1261 _exit(0);
1262 }
1263
1264 if (!recv_creds(sock[1], &cred, &v))
1265 goto out;
1266 ret = cred.pid;
1267
1268 out:
1269 close(sock[0]);
1270 close(sock[1]);
1271 if (pid > 0)
1272 wait_for_pid(pid);
1273 return ret;
1274 }
1275
1276 static pid_t lookup_initpid_in_store(pid_t qpid)
1277 {
1278 pid_t answer = 0;
1279 struct stat sb;
1280 struct pidns_init_store *e;
1281 char fnam[100];
1282
1283 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1284 store_lock();
1285 if (stat(fnam, &sb) < 0)
1286 goto out;
1287 e = lookup_verify_initpid(&sb);
1288 if (e) {
1289 answer = e->initpid;
1290 goto out;
1291 }
1292 answer = get_init_pid_for_task(qpid);
1293 if (answer > 0)
1294 save_initpid(&sb, answer);
1295
1296 out:
1297 /* we prune at end in case we are returning
1298 * the value we were about to return */
1299 prune_initpid_store();
1300 store_unlock();
1301 return answer;
1302 }
1303
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 *
 * Returns 0 only if the child exited normally with status 0. Returns -1
 * for a non-positive @pid, a waitpid() error other than EINTR, or any
 * other kind of termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
		/* interrupted, or an unexpected pid: try again */
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1324
1325
1326 /*
1327 * append pid to *src.
1328 * src: a pointer to a char* in which ot append the pid.
1329 * sz: the number of characters printed so far, minus trailing \0.
1330 * asz: the allocated size so far
1331 * pid: the pid to append
1332 */
1333 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1334 {
1335 char tmp[30];
1336
1337 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1338
1339 if (!*src || tmplen + *sz + 1 >= *asz) {
1340 char *tmp;
1341 do {
1342 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1343 } while (!tmp);
1344 *src = tmp;
1345 *asz += BUF_RESERVE_SIZE;
1346 }
1347 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1348 *sz += tmplen;
1349 }
1350
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not hold
 * three unsigned numbers are skipped. Returns the mapped id, or
 * (unsigned int)-1 when no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile) != NULL) {
		unsigned int ns_base;	/* first id of the range inside the namespace */
		unsigned int host_base;	/* first id of the range in the caller's namespace */
		unsigned int range;	/* number of ids in the range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/* ids wrapped around - unexpected for a proc file, so bail */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * end wraps, so this sum stays below ns_base + range.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no matching range */
	return -1;
}
1394
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether the
 * calling uid must be root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Buffer size used for /proc/<pid>/... path names. */
#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, is
 * privileged over @victim.
 *
 * With @req_ns_root false, equal uids are sufficient. Otherwise @uid must
 * map to 0 in @pid's user namespace and @victim must be mapped there at
 * all. Returns false on any error.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char map_path[PROCLEN];
	FILE *uid_map;
	bool privileged;
	int n;

	if (victim == -1 || uid == -1)
		return false;

	/* e.g. uid 1000 may act on files owned by uid 1000 */
	if (!req_ns_root && uid == victim)
		return true;

	n = snprintf(map_path, PROCLEN, "/proc/%d/uid_map", pid);
	if (n < 0 || n >= PROCLEN)
		return false;

	uid_map = fopen(map_path, "r");
	if (!uid_map)
		return false;

	/*
	 * The caller must be root in its namespace, and the victim must be
	 * mapped into that namespace. The second lookup only runs when the
	 * first one passed.
	 */
	privileged = convert_id_to_ns(uid_map, uid) == 0 &&
		     convert_id_to_ns(uid_map, victim) != -1;

	fclose(uid_map);
	return privileged;
}
1450
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode.
 *
 * Callers shift the owner or group bits down into the "other" position
 * before calling. Returns false for an access mode that is none of
 * O_RDONLY, O_WRONLY or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	const mode_t accmode = req_mode & O_ACCMODE;
	mode_t wanted;

	if (accmode == O_RDONLY)
		wanted = S_IROTH;
	else if (accmode == O_WRONLY)
		wanted = S_IWOTH;
	else if (accmode == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
1470
1471
/*
 * Return a newly allocated copy of the first path component of @taskcg
 * that lies below @querycg.
 *
 * @taskcg must be strictly longer than @querycg. For a @querycg of "/" or
 * "./" only the first character of @taskcg is skipped; otherwise
 * strlen(@querycg) + 1 characters are skipped. The copy is cut at the next
 * '/'. Returns NULL on bad input or allocation failure; the caller frees
 * the result.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t query_len = strlen(querycg);
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= query_len) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = query_len + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1497
/* Remove one trailing '\n' from @x, in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
1504
1505 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1506 {
1507 int cfd;
1508 char fnam[PROCLEN];
1509 FILE *f;
1510 char *answer = NULL;
1511 char *line = NULL;
1512 size_t len = 0;
1513 int ret;
1514 const char *h = find_mounted_controller(contrl, &cfd);
1515 if (!h)
1516 return NULL;
1517
1518 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1519 if (ret < 0 || ret >= PROCLEN)
1520 return NULL;
1521 if (!(f = fopen(fnam, "r")))
1522 return NULL;
1523
1524 while (getline(&line, &len, f) != -1) {
1525 char *c1, *c2;
1526 if (!line[0])
1527 continue;
1528 c1 = strchr(line, ':');
1529 if (!c1)
1530 goto out;
1531 c1++;
1532 c2 = strchr(c1, ':');
1533 if (!c2)
1534 goto out;
1535 *c2 = '\0';
1536 if (strcmp(c1, h) != 0)
1537 continue;
1538 c2++;
1539 stripnewline(c2);
1540 do {
1541 answer = strdup(c2);
1542 } while (!answer);
1543 break;
1544 }
1545
1546 out:
1547 fclose(f);
1548 free(line);
1549 return answer;
1550 }
1551
1552 /*
1553 * check whether a fuse context may access a cgroup dir or file
1554 *
1555 * If file is not null, it is a cgroup file to check under cg.
1556 * If file is null, then we are checking perms on cg itself.
1557 *
1558 * For files we can check the mode of the list_keys result.
1559 * For cgroups, we must make assumptions based on the files under the
1560 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1561 * yet.
1562 */
1563 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1564 {
1565 struct cgfs_files *k = NULL;
1566 bool ret = false;
1567
1568 k = cgfs_get_key(contrl, cg, file);
1569 if (!k)
1570 return false;
1571
1572 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1573 if (perms_include(k->mode >> 6, mode)) {
1574 ret = true;
1575 goto out;
1576 }
1577 }
1578 if (fc->gid == k->gid) {
1579 if (perms_include(k->mode >> 3, mode)) {
1580 ret = true;
1581 goto out;
1582 }
1583 }
1584 ret = perms_include(k->mode, mode);
1585
1586 out:
1587 free_key(k);
1588 return ret;
1589 }
1590
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg, in place.
 * When the whole path is "/init.scope" the result is "/", not "".
 */
static void prune_init_slice(char *cg)
{
	const size_t suffix_len = sizeof(INITSCOPE) - 1;
	const size_t len = strlen(cg);
	char *suffix;

	if (len < suffix_len)
		return;

	suffix = cg + (len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	/* keep the leading '/' if nothing else would remain */
	if (suffix == cg)
		suffix++;
	*suffix = '\0';
}
1608
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the (init.scope-pruned) cgroup of @pid is a string
 * prefix of @cg. If the answer is false and @nextcg is not NULL, *@nextcg
 * is set to the result of get_next_cgroup_dir(), which the caller must
 * free. *@nextcg is left untouched when the pid's cgroup cannot be read.
 *
 * NOTE(review): the comparison is a plain strncmp() prefix match without a
 * check for a '/' boundary - confirm whether "/a" vs "/ab" can occur here.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *task_cg, *cmp;
	bool is_ancestor;

	task_cg = get_pid_cgroup(pid, contrl);
	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	/*
	 * Callers pass '/' or './' (openat()) for the root cgroup, otherwise
	 * a cgroup without leading '/'. In the latter case the leading '/'
	 * of the task's cgroup is skipped before comparing.
	 */
	cmp = task_cg;
	if (*cg != '/' && strncmp(cg, "./", 2) != 0)
		cmp++;

	is_ancestor = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!is_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cg);
	return is_ancestor;
}
1651
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is visible when it is the root ("/" or "./"), when the task lives
 * in the root cgroup, when it equals the task's cgroup, or when one of the
 * two paths is a '/'-delimited prefix of the other. Returns false when the
 * task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw_cg, *task_cg;
	size_t target_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	raw_cg = get_pid_cgroup(pid, contrl);
	if (!raw_cg)
		return false;
	prune_init_slice(raw_cg);

	/* compare without the leading '/' of the task's cgroup */
	task_cg = raw_cg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * The task is in the root cgroup and can see everything.
		 * The prefix tests below cannot handle this since the only
		 * '/' was chopped off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		/* same length but different names */
		visible = false;
	}

	free(raw_cg);
	return visible;
}
1702
1703 /*
1704 * given /cgroup/freezer/a/b, return "freezer".
1705 * the returned char* should NOT be freed.
1706 */
1707 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1708 {
1709 const char *p1;
1710 char *contr, *slash;
1711
1712 if (strlen(path) < 9) {
1713 errno = EACCES;
1714 return NULL;
1715 }
1716 if (*(path + 7) != '/') {
1717 errno = EINVAL;
1718 return NULL;
1719 }
1720 p1 = path + 8;
1721 contr = strdupa(p1);
1722 if (!contr) {
1723 errno = ENOMEM;
1724 return NULL;
1725 }
1726 slash = strstr(contr, "/");
1727 if (slash)
1728 *slash = '\0';
1729
1730 int i;
1731 for (i = 0; i < num_hierarchies; i++) {
1732 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1733 return hierarchies[i];
1734 }
1735 errno = ENOENT;
1736 return NULL;
1737 }
1738
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the '/' that follows the
 * controller name, and sets errno to 0. The returned value may include
 * file (key) names. On failure NULL is returned with errno set to EACCES
 * (path shorter than 9 characters) or EINVAL (no '/' after the
 * controller name).
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1759
/*
 * Split the last path element off the path in @cg.
 *
 * *@dir receives a newly allocated copy of @cg cut at its last '/' (or a
 * full copy when there is no '/'); the caller frees it. *@last points at
 * that last '/' inside @cg itself (so it includes the slash) and must not
 * be freed; it is NULL when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;
	char *sep;

	/* keep trying until the copy succeeds */
	while (copy == NULL)
		copy = strdup(cg);
	*dir = copy;

	sep = strrchr(cg, '/');
	*last = sep;
	if (sep == NULL)
		return;

	/* cut the copy at the same offset as the last '/' in @cg */
	copy[sep - cg] = '\0';
}
1779
1780 /*
1781 * FUSE ops for /cgroup
1782 */
1783
1784 int cg_getattr(const char *path, struct stat *sb)
1785 {
1786 struct timespec now;
1787 struct fuse_context *fc = fuse_get_context();
1788 char * cgdir = NULL;
1789 char *last = NULL, *path1, *path2;
1790 struct cgfs_files *k = NULL;
1791 const char *cgroup;
1792 const char *controller = NULL;
1793 int ret = -ENOENT;
1794
1795
1796 if (!fc)
1797 return -EIO;
1798
1799 memset(sb, 0, sizeof(struct stat));
1800
1801 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1802 return -EINVAL;
1803
1804 sb->st_uid = sb->st_gid = 0;
1805 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1806 sb->st_size = 0;
1807
1808 if (strcmp(path, "/cgroup") == 0) {
1809 sb->st_mode = S_IFDIR | 00755;
1810 sb->st_nlink = 2;
1811 return 0;
1812 }
1813
1814 controller = pick_controller_from_path(fc, path);
1815 if (!controller)
1816 return -errno;
1817 cgroup = find_cgroup_in_path(path);
1818 if (!cgroup) {
1819 /* this is just /cgroup/controller, return it as a dir */
1820 sb->st_mode = S_IFDIR | 00755;
1821 sb->st_nlink = 2;
1822 return 0;
1823 }
1824
1825 get_cgdir_and_path(cgroup, &cgdir, &last);
1826
1827 if (!last) {
1828 path1 = "/";
1829 path2 = cgdir;
1830 } else {
1831 path1 = cgdir;
1832 path2 = last;
1833 }
1834
1835 pid_t initpid = lookup_initpid_in_store(fc->pid);
1836 if (initpid <= 0)
1837 initpid = fc->pid;
1838 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1839 * Then check that caller's cgroup is under path if last is a child
1840 * cgroup, or cgdir if last is a file */
1841
1842 if (is_child_cgroup(controller, path1, path2)) {
1843 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1844 ret = -ENOENT;
1845 goto out;
1846 }
1847 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1848 /* this is just /cgroup/controller, return it as a dir */
1849 sb->st_mode = S_IFDIR | 00555;
1850 sb->st_nlink = 2;
1851 ret = 0;
1852 goto out;
1853 }
1854 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1855 ret = -EACCES;
1856 goto out;
1857 }
1858
1859 // get uid, gid, from '/tasks' file and make up a mode
1860 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1861 sb->st_mode = S_IFDIR | 00755;
1862 k = cgfs_get_key(controller, cgroup, NULL);
1863 if (!k) {
1864 sb->st_uid = sb->st_gid = 0;
1865 } else {
1866 sb->st_uid = k->uid;
1867 sb->st_gid = k->gid;
1868 }
1869 free_key(k);
1870 sb->st_nlink = 2;
1871 ret = 0;
1872 goto out;
1873 }
1874
1875 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1876 sb->st_mode = S_IFREG | k->mode;
1877 sb->st_nlink = 1;
1878 sb->st_uid = k->uid;
1879 sb->st_gid = k->gid;
1880 sb->st_size = 0;
1881 free_key(k);
1882 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1883 ret = -ENOENT;
1884 goto out;
1885 }
1886 ret = 0;
1887 }
1888
1889 out:
1890 free(cgdir);
1891 return ret;
1892 }
1893
1894 int cg_opendir(const char *path, struct fuse_file_info *fi)
1895 {
1896 struct fuse_context *fc = fuse_get_context();
1897 const char *cgroup;
1898 struct file_info *dir_info;
1899 char *controller = NULL;
1900
1901 if (!fc)
1902 return -EIO;
1903
1904 if (strcmp(path, "/cgroup") == 0) {
1905 cgroup = NULL;
1906 controller = NULL;
1907 } else {
1908 // return list of keys for the controller, and list of child cgroups
1909 controller = pick_controller_from_path(fc, path);
1910 if (!controller)
1911 return -errno;
1912
1913 cgroup = find_cgroup_in_path(path);
1914 if (!cgroup) {
1915 /* this is just /cgroup/controller, return its contents */
1916 cgroup = "/";
1917 }
1918 }
1919
1920 pid_t initpid = lookup_initpid_in_store(fc->pid);
1921 if (initpid <= 0)
1922 initpid = fc->pid;
1923 if (cgroup) {
1924 if (!caller_may_see_dir(initpid, controller, cgroup))
1925 return -ENOENT;
1926 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1927 return -EACCES;
1928 }
1929
1930 /* we'll free this at cg_releasedir */
1931 dir_info = malloc(sizeof(*dir_info));
1932 if (!dir_info)
1933 return -ENOMEM;
1934 dir_info->controller = must_copy_string(controller);
1935 dir_info->cgroup = must_copy_string(cgroup);
1936 dir_info->type = LXC_TYPE_CGDIR;
1937 dir_info->buf = NULL;
1938 dir_info->file = NULL;
1939 dir_info->buflen = 0;
1940
1941 fi->fh = (unsigned long)dir_info;
1942 return 0;
1943 }
1944
1945 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1946 struct fuse_file_info *fi)
1947 {
1948 struct file_info *d = (struct file_info *)fi->fh;
1949 struct cgfs_files **list = NULL;
1950 int i, ret;
1951 char *nextcg = NULL;
1952 struct fuse_context *fc = fuse_get_context();
1953 char **clist = NULL;
1954
1955 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1956 return -EIO;
1957
1958 if (d->type != LXC_TYPE_CGDIR) {
1959 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1960 return -EIO;
1961 }
1962 if (!d->cgroup && !d->controller) {
1963 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1964 int i;
1965
1966 for (i = 0; i < num_hierarchies; i++) {
1967 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1968 return -EIO;
1969 }
1970 }
1971 return 0;
1972 }
1973
1974 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1975 // not a valid cgroup
1976 ret = -EINVAL;
1977 goto out;
1978 }
1979
1980 pid_t initpid = lookup_initpid_in_store(fc->pid);
1981 if (initpid <= 0)
1982 initpid = fc->pid;
1983 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1984 if (nextcg) {
1985 ret = filler(buf, nextcg, NULL, 0);
1986 free(nextcg);
1987 if (ret != 0) {
1988 ret = -EIO;
1989 goto out;
1990 }
1991 }
1992 ret = 0;
1993 goto out;
1994 }
1995
1996 for (i = 0; list[i]; i++) {
1997 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002
2003 // now get the list of child cgroups
2004
2005 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2006 ret = 0;
2007 goto out;
2008 }
2009 if (clist) {
2010 for (i = 0; clist[i]; i++) {
2011 if (filler(buf, clist[i], NULL, 0) != 0) {
2012 ret = -EIO;
2013 goto out;
2014 }
2015 }
2016 }
2017 ret = 0;
2018
2019 out:
2020 free_keys(list);
2021 if (clist) {
2022 for (i = 0; clist[i]; i++)
2023 free(clist[i]);
2024 free(clist);
2025 }
2026 return ret;
2027 }
2028
2029 static void do_release_file_info(struct fuse_file_info *fi)
2030 {
2031 struct file_info *f = (struct file_info *)fi->fh;
2032
2033 if (!f)
2034 return;
2035
2036 fi->fh = 0;
2037
2038 free(f->controller);
2039 f->controller = NULL;
2040 free(f->cgroup);
2041 f->cgroup = NULL;
2042 free(f->file);
2043 f->file = NULL;
2044 free(f->buf);
2045 f->buf = NULL;
2046 free(f);
2047 }
2048
/*
 * FUSE releasedir handler: drop the directory handle stored in fi->fh.
 * Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;	/* the handle alone identifies what to release */

	do_release_file_info(fi);
	return 0;
}
2054
2055 int cg_open(const char *path, struct fuse_file_info *fi)
2056 {
2057 const char *cgroup;
2058 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2059 struct cgfs_files *k = NULL;
2060 struct file_info *file_info;
2061 struct fuse_context *fc = fuse_get_context();
2062 int ret;
2063
2064 if (!fc)
2065 return -EIO;
2066
2067 controller = pick_controller_from_path(fc, path);
2068 if (!controller)
2069 return -errno;
2070 cgroup = find_cgroup_in_path(path);
2071 if (!cgroup)
2072 return -errno;
2073
2074 get_cgdir_and_path(cgroup, &cgdir, &last);
2075 if (!last) {
2076 path1 = "/";
2077 path2 = cgdir;
2078 } else {
2079 path1 = cgdir;
2080 path2 = last;
2081 }
2082
2083 k = cgfs_get_key(controller, path1, path2);
2084 if (!k) {
2085 ret = -EINVAL;
2086 goto out;
2087 }
2088 free_key(k);
2089
2090 pid_t initpid = lookup_initpid_in_store(fc->pid);
2091 if (initpid <= 0)
2092 initpid = fc->pid;
2093 if (!caller_may_see_dir(initpid, controller, path1)) {
2094 ret = -ENOENT;
2095 goto out;
2096 }
2097 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2098 ret = -EACCES;
2099 goto out;
2100 }
2101
2102 /* we'll free this at cg_release */
2103 file_info = malloc(sizeof(*file_info));
2104 if (!file_info) {
2105 ret = -ENOMEM;
2106 goto out;
2107 }
2108 file_info->controller = must_copy_string(controller);
2109 file_info->cgroup = must_copy_string(path1);
2110 file_info->file = must_copy_string(path2);
2111 file_info->type = LXC_TYPE_CGFILE;
2112 file_info->buf = NULL;
2113 file_info->buflen = 0;
2114
2115 fi->fh = (unsigned long)file_info;
2116 ret = 0;
2117
2118 out:
2119 free(cgdir);
2120 return ret;
2121 }
2122
2123 int cg_access(const char *path, int mode)
2124 {
2125 int ret;
2126 const char *cgroup;
2127 char *path1, *path2, *controller;
2128 char *last = NULL, *cgdir = NULL;
2129 struct cgfs_files *k = NULL;
2130 struct fuse_context *fc = fuse_get_context();
2131
2132 if (strcmp(path, "/cgroup") == 0)
2133 return 0;
2134
2135 if (!fc)
2136 return -EIO;
2137
2138 controller = pick_controller_from_path(fc, path);
2139 if (!controller)
2140 return -errno;
2141 cgroup = find_cgroup_in_path(path);
2142 if (!cgroup) {
2143 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2144 if ((mode & W_OK) == 0)
2145 return 0;
2146 return -EACCES;
2147 }
2148
2149 get_cgdir_and_path(cgroup, &cgdir, &last);
2150 if (!last) {
2151 path1 = "/";
2152 path2 = cgdir;
2153 } else {
2154 path1 = cgdir;
2155 path2 = last;
2156 }
2157
2158 k = cgfs_get_key(controller, path1, path2);
2159 if (!k) {
2160 if ((mode & W_OK) == 0)
2161 ret = 0;
2162 else
2163 ret = -EACCES;
2164 goto out;
2165 }
2166 free_key(k);
2167
2168 pid_t initpid = lookup_initpid_in_store(fc->pid);
2169 if (initpid <= 0)
2170 initpid = fc->pid;
2171 if (!caller_may_see_dir(initpid, controller, path1)) {
2172 ret = -ENOENT;
2173 goto out;
2174 }
2175 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2176 ret = -EACCES;
2177 goto out;
2178 }
2179
2180 ret = 0;
2181
2182 out:
2183 free(cgdir);
2184 return ret;
2185 }
2186
/*
 * FUSE release handler: drop the file handle stored in fi->fh.
 * Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;	/* the handle alone identifies what to release */

	do_release_file_info(fi);
	return 0;
}
2192
/* epoll events which mean "there is something to read or the peer is gone" */
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable (or is hung up), for roughly @timeout
 * seconds at most.
 *
 * Returns true when an event arrived. Returns false on error or timeout;
 * when the deadline had already passed before waiting, errno is set to 0.
 * Interrupted waits are restarted with the remaining time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime;

	if ((starttime = time(NULL)) == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* "%m" only expands inside a format string, so spell out errno */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) == (time_t)-1) {
		close(epfd);
		return false;
	}

	deltatime = (int)((starttime + timeout) - now);
	if (deltatime < 0) { // timeout
		close(epfd);
		/* set after close() so that close() cannot overwrite it */
		errno = 0;
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2240
/*
 * Receive up to @len bytes from @sockfd into @buf, after waiting up to
 * 2 seconds (wait_for_sock() timeout) for data. Returns the result of the
 * non-blocking recv(), or -1 when nothing arrived in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool readable = wait_for_sock(sockfd, 2);

	if (readable)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
2247
2248 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2249 {
2250 struct msghdr msg = { 0 };
2251 struct iovec iov;
2252 struct cmsghdr *cmsg;
2253 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2254 char buf[1];
2255 buf[0] = 'p';
2256
2257 if (pingfirst) {
2258 if (msgrecv(sock, buf, 1) != 1) {
2259 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2260 return SEND_CREDS_FAIL;
2261 }
2262 }
2263
2264 msg.msg_control = cmsgbuf;
2265 msg.msg_controllen = sizeof(cmsgbuf);
2266
2267 cmsg = CMSG_FIRSTHDR(&msg);
2268 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2269 cmsg->cmsg_level = SOL_SOCKET;
2270 cmsg->cmsg_type = SCM_CREDENTIALS;
2271 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2272
2273 msg.msg_name = NULL;
2274 msg.msg_namelen = 0;
2275
2276 buf[0] = v;
2277 iov.iov_base = buf;
2278 iov.iov_len = sizeof(buf);
2279 msg.msg_iov = &iov;
2280 msg.msg_iovlen = 1;
2281
2282 if (sendmsg(sock, &msg, 0) < 0) {
2283 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2284 if (errno == 3)
2285 return SEND_CREDS_NOTSK;
2286 return SEND_CREDS_FAIL;
2287 }
2288
2289 return SEND_CREDS_OK;
2290 }
2291
2292 static bool recv_creds(int sock, struct ucred *cred, char *v)
2293 {
2294 struct msghdr msg = { 0 };
2295 struct iovec iov;
2296 struct cmsghdr *cmsg;
2297 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2298 char buf[1];
2299 int ret;
2300 int optval = 1;
2301
2302 *v = '1';
2303
2304 cred->pid = -1;
2305 cred->uid = -1;
2306 cred->gid = -1;
2307
2308 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2309 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2310 return false;
2311 }
2312 buf[0] = '1';
2313 if (write(sock, buf, 1) != 1) {
2314 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2315 return false;
2316 }
2317
2318 msg.msg_name = NULL;
2319 msg.msg_namelen = 0;
2320 msg.msg_control = cmsgbuf;
2321 msg.msg_controllen = sizeof(cmsgbuf);
2322
2323 iov.iov_base = buf;
2324 iov.iov_len = sizeof(buf);
2325 msg.msg_iov = &iov;
2326 msg.msg_iovlen = 1;
2327
2328 if (!wait_for_sock(sock, 2)) {
2329 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2330 return false;
2331 }
2332 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2333 if (ret < 0) {
2334 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2335 return false;
2336 }
2337
2338 cmsg = CMSG_FIRSTHDR(&msg);
2339
2340 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2341 cmsg->cmsg_level == SOL_SOCKET &&
2342 cmsg->cmsg_type == SCM_CREDENTIALS) {
2343 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2344 }
2345 *v = buf[0];
2346
2347 return true;
2348 }
2349
/* Argument bundle passed through clone() to pid_ns_clone_wrapper(). */
struct pid_ns_clone_args {
	int *cpipe;			/* pipe used to send the ACK to the parent */
	int sock;			/* first argument for @wrapped */
	pid_t tpid;			/* second argument for @wrapped */
	int (*wrapped)(int, pid_t);	/* pid_from_ns or pid_to_ns */
};

/*
 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
 * with clone(). This simply writes '1' as ACK back to the parent
 * before calling the actual wrapped function, whose return value becomes
 * the return value of this function. A failed ACK write is only logged.
 */
static int pid_ns_clone_wrapper(void *arg)
{
	struct pid_ns_clone_args *args = arg;
	int *ack_pipe = args->cpipe;
	char ack = '1';

	/* this task only writes to the pipe */
	close(ack_pipe[0]);
	if (write(ack_pipe[1], &ack, sizeof(char)) < 0)
		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
	close(ack_pipe[1]);

	return args->wrapped(args->sock, args->tpid);
}
2372
2373 /*
2374 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2375 * int value back over the socket. This shifts the pid from the
2376 * sender's pidns into tpid's pidns.
2377 */
2378 static int pid_to_ns(int sock, pid_t tpid)
2379 {
2380 char v = '0';
2381 struct ucred cred;
2382
2383 while (recv_creds(sock, &cred, &v)) {
2384 if (v == '1')
2385 return 0;
2386 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2387 return 1;
2388 }
2389 return 0;
2390 }
2391
2392
2393 /*
2394 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2395 * in your old pidns. Only children which you clone will be in the target
2396 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2397 * actually convert pids.
2398 *
2399 * Note: glibc's fork() does not respect pidns, which can lead to failed
2400 * assertions inside glibc (and thus failed forks) if the child's pid in
2401 * the pidns and the parent pid outside are identical. Using clone prevents
2402 * this issue.
2403 */
2404 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2405 {
2406 int newnsfd = -1, ret, cpipe[2];
2407 char fnam[100];
2408 pid_t cpid;
2409 char v;
2410
2411 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2412 if (ret < 0 || ret >= sizeof(fnam))
2413 _exit(1);
2414 newnsfd = open(fnam, O_RDONLY);
2415 if (newnsfd < 0)
2416 _exit(1);
2417 if (setns(newnsfd, 0) < 0)
2418 _exit(1);
2419 close(newnsfd);
2420
2421 if (pipe(cpipe) < 0)
2422 _exit(1);
2423
2424 struct pid_ns_clone_args args = {
2425 .cpipe = cpipe,
2426 .sock = sock,
2427 .tpid = tpid,
2428 .wrapped = &pid_to_ns
2429 };
2430 size_t stack_size = sysconf(_SC_PAGESIZE);
2431 void *stack = alloca(stack_size);
2432
2433 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2434 if (cpid < 0)
2435 _exit(1);
2436
2437 // give the child 1 second to be done forking and
2438 // write its ack
2439 if (!wait_for_sock(cpipe[0], 1))
2440 _exit(1);
2441 ret = read(cpipe[0], &v, 1);
2442 if (ret != sizeof(char) || v != '1')
2443 _exit(1);
2444
2445 if (!wait_for_pid(cpid))
2446 _exit(1);
2447 _exit(0);
2448 }
2449
2450 /*
2451 * To read cgroup files with a particular pid, we will setns into the child
2452 * pidns, open a pipe, fork a child - which will be the first to really be in
2453 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2454 */
2455 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2456 {
2457 int sock[2] = {-1, -1};
2458 char *tmpdata = NULL;
2459 int ret;
2460 pid_t qpid, cpid = -1;
2461 bool answer = false;
2462 char v = '0';
2463 struct ucred cred;
2464 size_t sz = 0, asz = 0;
2465
2466 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2467 return false;
2468
2469 /*
2470 * Now we read the pids from returned data one by one, pass
2471 * them into a child in the target namespace, read back the
2472 * translated pids, and put them into our to-return data
2473 */
2474
2475 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2476 perror("socketpair");
2477 free(tmpdata);
2478 return false;
2479 }
2480
2481 cpid = fork();
2482 if (cpid == -1)
2483 goto out;
2484
2485 if (!cpid) // child - exits when done
2486 pid_to_ns_wrapper(sock[1], tpid);
2487
2488 char *ptr = tmpdata;
2489 cred.uid = 0;
2490 cred.gid = 0;
2491 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2492 cred.pid = qpid;
2493 ret = send_creds(sock[0], &cred, v, true);
2494
2495 if (ret == SEND_CREDS_NOTSK)
2496 goto next;
2497 if (ret == SEND_CREDS_FAIL)
2498 goto out;
2499
2500 // read converted results
2501 if (!wait_for_sock(sock[0], 2)) {
2502 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2503 goto out;
2504 }
2505 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2506 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2507 goto out;
2508 }
2509 must_strcat_pid(d, &sz, &asz, qpid);
2510 next:
2511 ptr = strchr(ptr, '\n');
2512 if (!ptr)
2513 break;
2514 ptr++;
2515 }
2516
2517 cred.pid = getpid();
2518 v = '1';
2519 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2520 // failed to ask child to exit
2521 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2522 goto out;
2523 }
2524
2525 answer = true;
2526
2527 out:
2528 free(tmpdata);
2529 if (cpid != -1)
2530 wait_for_pid(cpid);
2531 if (sock[0] != -1) {
2532 close(sock[0]);
2533 close(sock[1]);
2534 }
2535 return answer;
2536 }
2537
2538 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2539 struct fuse_file_info *fi)
2540 {
2541 struct fuse_context *fc = fuse_get_context();
2542 struct file_info *f = (struct file_info *)fi->fh;
2543 struct cgfs_files *k = NULL;
2544 char *data = NULL;
2545 int ret, s;
2546 bool r;
2547
2548 if (f->type != LXC_TYPE_CGFILE) {
2549 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2550 return -EIO;
2551 }
2552
2553 if (offset)
2554 return 0;
2555
2556 if (!fc)
2557 return -EIO;
2558
2559 if (!f->controller)
2560 return -EINVAL;
2561
2562 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2563 return -EINVAL;
2564 }
2565 free_key(k);
2566
2567
2568 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2569 ret = -EACCES;
2570 goto out;
2571 }
2572
2573 if (strcmp(f->file, "tasks") == 0 ||
2574 strcmp(f->file, "/tasks") == 0 ||
2575 strcmp(f->file, "/cgroup.procs") == 0 ||
2576 strcmp(f->file, "cgroup.procs") == 0)
2577 // special case - we have to translate the pids
2578 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2579 else
2580 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2581
2582 if (!r) {
2583 ret = -EINVAL;
2584 goto out;
2585 }
2586
2587 if (!data) {
2588 ret = 0;
2589 goto out;
2590 }
2591 s = strlen(data);
2592 if (s > size)
2593 s = size;
2594 memcpy(buf, data, s);
2595 if (s > 0 && s < size && data[s-1] != '\n')
2596 buf[s++] = '\n';
2597
2598 ret = s;
2599
2600 out:
2601 free(data);
2602 return ret;
2603 }
2604
2605 static int pid_from_ns(int sock, pid_t tpid)
2606 {
2607 pid_t vpid;
2608 struct ucred cred;
2609 char v;
2610 int ret;
2611
2612 cred.uid = 0;
2613 cred.gid = 0;
2614 while (1) {
2615 if (!wait_for_sock(sock, 2)) {
2616 lxcfs_error("%s\n", "Timeout reading from parent.");
2617 return 1;
2618 }
2619 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2620 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2621 return 1;
2622 }
2623 if (vpid == -1) // done
2624 break;
2625 v = '0';
2626 cred.pid = vpid;
2627 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2628 v = '1';
2629 cred.pid = getpid();
2630 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2631 return 1;
2632 }
2633 }
2634 return 0;
2635 }
2636
2637 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2638 {
2639 int newnsfd = -1, ret, cpipe[2];
2640 char fnam[100];
2641 pid_t cpid;
2642 char v;
2643
2644 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2645 if (ret < 0 || ret >= sizeof(fnam))
2646 _exit(1);
2647 newnsfd = open(fnam, O_RDONLY);
2648 if (newnsfd < 0)
2649 _exit(1);
2650 if (setns(newnsfd, 0) < 0)
2651 _exit(1);
2652 close(newnsfd);
2653
2654 if (pipe(cpipe) < 0)
2655 _exit(1);
2656
2657 struct pid_ns_clone_args args = {
2658 .cpipe = cpipe,
2659 .sock = sock,
2660 .tpid = tpid,
2661 .wrapped = &pid_from_ns
2662 };
2663 size_t stack_size = sysconf(_SC_PAGESIZE);
2664 void *stack = alloca(stack_size);
2665
2666 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2667 if (cpid < 0)
2668 _exit(1);
2669
2670 // give the child 1 second to be done forking and
2671 // write its ack
2672 if (!wait_for_sock(cpipe[0], 1))
2673 _exit(1);
2674 ret = read(cpipe[0], &v, 1);
2675 if (ret != sizeof(char) || v != '1')
2676 _exit(1);
2677
2678 if (!wait_for_pid(cpid))
2679 _exit(1);
2680 _exit(0);
2681 }
2682
/*
 * Translate host @uid through the uid map of @pid (/proc/<pid>/uid_map).
 *
 * On success the translated id is stored in *@answer and true is returned.
 * Returns false when the map cannot be opened or when convert_id_to_ns()
 * yields -1; in the latter case *@answer holds that -1.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *map;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	map = fopen(mapfile, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2704
/*
 * get_pid_creds: look up the uid and gid of @pid in /proc/<pid>/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used, i.e. the real
 * ids (XXX should we use euid here?).
 *
 * *@uid and *@gid are preset to -1 and keep that value when the status
 * file cannot be opened or the corresponding line is missing/malformed.
 * Parsing stops at the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char buf[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	/* buf first holds the path, afterwards it is reused as line buffer */
	sprintf(buf, "/proc/%d/status", pid);
	status = fopen(buf, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", buf, strerror(errno));
		return;
	}

	while (fgets(buf, 400, status)) {
		if (strncmp(buf, "Uid:", 4) == 0) {
			uid_t parsed;

			if (sscanf(buf + 4, "%u", &parsed) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed;
		} else if (strncmp(buf, "Gid:", 4) == 0) {
			gid_t parsed;

			if (sscanf(buf + 4, "%u", &parsed) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed;
		}
	}

	fclose(status);
}
2743
/*
 * May the requestor @r (running with host uid @r_uid) move victim @v to a
 * new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* requestor must be root inside its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be visible from there */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2769
/*
 * Move the pids listed in @buf (one per line, as seen by the writer) into
 * cgroup @cg of controller @contrl.
 *
 * A forked helper enters the pid namespace of @tpid (see
 * pid_from_ns_wrapper()). Every pid parsed from @buf is written to the
 * helper, which answers with socket credentials; for answers flagged '0'
 * the pid carried in the credentials is checked with may_move_pid() and
 * then printed to the file returned by open_pids_file().
 *
 * @tuid is the uid of the writer and is used for the permission check.
 * @file is not used by this function.
 *
 * Returns true only if no move was refused, no fprintf() failed and the
 * pids file could be closed cleanly.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t pid, helper = -1;
	FILE *pids_file = NULL;
	bool moved = false, failed = false;
	const char *cursor;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	helper = fork();
	if (helper == -1)
		goto out;

	if (helper == 0) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	for (cursor = buf; sscanf(cursor, "%d", &pid) == 1; cursor++) {
		struct ucred cred;
		char flag;

		if (write(sock[0], &pid, sizeof(pid)) != sizeof(pid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &flag) && flag == '0') {
			if (!may_move_pid(tpid, tuid, cred.pid)) {
				failed = true;
				break;
			}
			if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
				failed = true;
		}

		/* position on the newline; the loop step moves past it */
		cursor = strchr(cursor, '\n');
		if (!cursor)
			break;
	}

	/* All good, write the value */
	pid = -1;
	if (write(sock[0], &pid, sizeof(pid)) != sizeof(pid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	moved = !failed;

out:
	if (helper != -1)
		wait_for_pid(helper);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file && fclose(pids_file) != 0)
		moved = false;
	return moved;
}
2848
2849 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2850 struct fuse_file_info *fi)
2851 {
2852 struct fuse_context *fc = fuse_get_context();
2853 char *localbuf = NULL;
2854 struct cgfs_files *k = NULL;
2855 struct file_info *f = (struct file_info *)fi->fh;
2856 bool r;
2857
2858 if (f->type != LXC_TYPE_CGFILE) {
2859 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2860 return -EIO;
2861 }
2862
2863 if (offset)
2864 return 0;
2865
2866 if (!fc)
2867 return -EIO;
2868
2869 localbuf = alloca(size+1);
2870 localbuf[size] = '\0';
2871 memcpy(localbuf, buf, size);
2872
2873 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2874 size = -EINVAL;
2875 goto out;
2876 }
2877
2878 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2879 size = -EACCES;
2880 goto out;
2881 }
2882
2883 if (strcmp(f->file, "tasks") == 0 ||
2884 strcmp(f->file, "/tasks") == 0 ||
2885 strcmp(f->file, "/cgroup.procs") == 0 ||
2886 strcmp(f->file, "cgroup.procs") == 0)
2887 // special case - we have to translate the pids
2888 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2889 else
2890 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2891
2892 if (!r)
2893 size = -EINVAL;
2894
2895 out:
2896 free_key(k);
2897 return size;
2898 }
2899
2900 int cg_chown(const char *path, uid_t uid, gid_t gid)
2901 {
2902 struct fuse_context *fc = fuse_get_context();
2903 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2904 struct cgfs_files *k = NULL;
2905 const char *cgroup;
2906 int ret;
2907
2908 if (!fc)
2909 return -EIO;
2910
2911 if (strcmp(path, "/cgroup") == 0)
2912 return -EPERM;
2913
2914 controller = pick_controller_from_path(fc, path);
2915 if (!controller)
2916 return errno == ENOENT ? -EPERM : -errno;
2917
2918 cgroup = find_cgroup_in_path(path);
2919 if (!cgroup)
2920 /* this is just /cgroup/controller */
2921 return -EPERM;
2922
2923 get_cgdir_and_path(cgroup, &cgdir, &last);
2924
2925 if (!last) {
2926 path1 = "/";
2927 path2 = cgdir;
2928 } else {
2929 path1 = cgdir;
2930 path2 = last;
2931 }
2932
2933 if (is_child_cgroup(controller, path1, path2)) {
2934 // get uid, gid, from '/tasks' file and make up a mode
2935 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2936 k = cgfs_get_key(controller, cgroup, "tasks");
2937
2938 } else
2939 k = cgfs_get_key(controller, path1, path2);
2940
2941 if (!k) {
2942 ret = -EINVAL;
2943 goto out;
2944 }
2945
2946 /*
2947 * This being a fuse request, the uid and gid must be valid
2948 * in the caller's namespace. So we can just check to make
2949 * sure that the caller is root in his uid, and privileged
2950 * over the file's current owner.
2951 */
2952 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2953 ret = -EACCES;
2954 goto out;
2955 }
2956
2957 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2958
2959 out:
2960 free_key(k);
2961 free(cgdir);
2962
2963 return ret;
2964 }
2965
2966 int cg_chmod(const char *path, mode_t mode)
2967 {
2968 struct fuse_context *fc = fuse_get_context();
2969 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2970 struct cgfs_files *k = NULL;
2971 const char *cgroup;
2972 int ret;
2973
2974 if (!fc)
2975 return -EIO;
2976
2977 if (strcmp(path, "/cgroup") == 0)
2978 return -EPERM;
2979
2980 controller = pick_controller_from_path(fc, path);
2981 if (!controller)
2982 return errno == ENOENT ? -EPERM : -errno;
2983
2984 cgroup = find_cgroup_in_path(path);
2985 if (!cgroup)
2986 /* this is just /cgroup/controller */
2987 return -EPERM;
2988
2989 get_cgdir_and_path(cgroup, &cgdir, &last);
2990
2991 if (!last) {
2992 path1 = "/";
2993 path2 = cgdir;
2994 } else {
2995 path1 = cgdir;
2996 path2 = last;
2997 }
2998
2999 if (is_child_cgroup(controller, path1, path2)) {
3000 // get uid, gid, from '/tasks' file and make up a mode
3001 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3002 k = cgfs_get_key(controller, cgroup, "tasks");
3003
3004 } else
3005 k = cgfs_get_key(controller, path1, path2);
3006
3007 if (!k) {
3008 ret = -EINVAL;
3009 goto out;
3010 }
3011
3012 /*
3013 * This being a fuse request, the uid and gid must be valid
3014 * in the caller's namespace. So we can just check to make
3015 * sure that the caller is root in his uid, and privileged
3016 * over the file's current owner.
3017 */
3018 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3019 ret = -EPERM;
3020 goto out;
3021 }
3022
3023 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3024 ret = -EINVAL;
3025 goto out;
3026 }
3027
3028 ret = 0;
3029 out:
3030 free_key(k);
3031 free(cgdir);
3032 return ret;
3033 }
3034
3035 int cg_mkdir(const char *path, mode_t mode)
3036 {
3037 struct fuse_context *fc = fuse_get_context();
3038 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3039 const char *cgroup;
3040 int ret;
3041
3042 if (!fc)
3043 return -EIO;
3044
3045 controller = pick_controller_from_path(fc, path);
3046 if (!controller)
3047 return errno == ENOENT ? -EPERM : -errno;
3048
3049 cgroup = find_cgroup_in_path(path);
3050 if (!cgroup)
3051 return -errno;
3052
3053 get_cgdir_and_path(cgroup, &cgdir, &last);
3054 if (!last)
3055 path1 = "/";
3056 else
3057 path1 = cgdir;
3058
3059 pid_t initpid = lookup_initpid_in_store(fc->pid);
3060 if (initpid <= 0)
3061 initpid = fc->pid;
3062 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3063 if (!next)
3064 ret = -EINVAL;
3065 else if (last && strcmp(next, last) == 0)
3066 ret = -EEXIST;
3067 else
3068 ret = -EPERM;
3069 goto out;
3070 }
3071
3072 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3073 ret = -EACCES;
3074 goto out;
3075 }
3076 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3077 ret = -EACCES;
3078 goto out;
3079 }
3080
3081 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3082
3083 out:
3084 free(cgdir);
3085 free(next);
3086 return ret;
3087 }
3088
3089 int cg_rmdir(const char *path)
3090 {
3091 struct fuse_context *fc = fuse_get_context();
3092 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3093 const char *cgroup;
3094 int ret;
3095
3096 if (!fc)
3097 return -EIO;
3098
3099 controller = pick_controller_from_path(fc, path);
3100 if (!controller) /* Someone's trying to delete "/cgroup". */
3101 return -EPERM;
3102
3103 cgroup = find_cgroup_in_path(path);
3104 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3105 return -EPERM;
3106
3107 get_cgdir_and_path(cgroup, &cgdir, &last);
3108 if (!last) {
3109 /* Someone's trying to delete a cgroup on the same level as the
3110 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3111 * rmdir "/cgroup/blkio/init.slice".
3112 */
3113 ret = -EPERM;
3114 goto out;
3115 }
3116
3117 pid_t initpid = lookup_initpid_in_store(fc->pid);
3118 if (initpid <= 0)
3119 initpid = fc->pid;
3120 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3121 if (!last || (next && (strcmp(next, last) == 0)))
3122 ret = -EBUSY;
3123 else
3124 ret = -ENOENT;
3125 goto out;
3126 }
3127
3128 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3129 ret = -EACCES;
3130 goto out;
3131 }
3132 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3133 ret = -EACCES;
3134 goto out;
3135 }
3136
3137 if (!cgfs_remove(controller, cgroup)) {
3138 ret = -EINVAL;
3139 goto out;
3140 }
3141
3142 ret = 0;
3143
3144 out:
3145 free(cgdir);
3146 free(next);
3147 return ret;
3148 }
3149
/* Return true iff @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3156
/*
 * Extract selected counters from the text of a memory.stat file.
 *
 * @memstat is scanned line by line. For each line starting with one of the
 * "total_*" keys below, the number following the key is parsed into the
 * matching output parameter and divided by 1024. Output parameters whose
 * key never appears are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *eol;
	size_t i;

	for (; *memstat; memstat = eol + 1) {
		/* only the first matching key is applied to a line */
		for (i = 0; i < nfields; i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			sscanf(memstat + strlen(fields[i].key), "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
	}
}
3190
/*
 * Find the line "<major>:<minor> <iotype> <value>" in the blkio statistics
 * text @str and store <value> in *@v.
 *
 * *@v is set to 0 first and stays 0 if no line starts with that key.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *eol;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (; *str; str = eol + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3213
3214 static int read_file(const char *path, char *buf, size_t size,
3215 struct file_info *d)
3216 {
3217 size_t linelen = 0, total_len = 0, rv = 0;
3218 char *line = NULL;
3219 char *cache = d->buf;
3220 size_t cache_size = d->buflen;
3221 FILE *f = fopen(path, "r");
3222 if (!f)
3223 return 0;
3224
3225 while (getline(&line, &linelen, f) != -1) {
3226 ssize_t l = snprintf(cache, cache_size, "%s", line);
3227 if (l < 0) {
3228 perror("Error writing to cache");
3229 rv = 0;
3230 goto err;
3231 }
3232 if (l >= cache_size) {
3233 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3234 rv = 0;
3235 goto err;
3236 }
3237 cache += l;
3238 cache_size -= l;
3239 total_len += l;
3240 }
3241
3242 d->size = total_len;
3243 if (total_len > size)
3244 total_len = size;
3245
3246 /* read from off 0 */
3247 memcpy(buf, d->buf, total_len);
3248 rv = total_len;
3249 err:
3250 fclose(f);
3251 free(line);
3252 return rv;
3253 }
3254
3255 /*
3256 * FUSE ops for /proc
3257 */
3258
/*
 * Read the numeric value of @file in memory cgroup @cgroup.
 *
 * Returns the parsed value, or (unsigned long)-1 when the file could not
 * be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *text = NULL;
	bool have_value;

	have_value = cgfs_get_value("memory", cgroup, file, &text);
	if (have_value)
		limit = strtoul(text, NULL, 10);

	free(text);

	return limit;
}
3271
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors up to and including "/".
 *
 * Ancestors for which get_memlimit() reports (unsigned long)-1 are
 * ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* dirname() may modify its argument, so walk a private stack copy */
	char *dir = strdupa(cgroup);
	unsigned long lowest = get_memlimit(dir, file);

	while (strcmp(dir, "/") != 0) {
		unsigned long current;

		dir = dirname(dir);
		current = get_memlimit(dir, file);
		if (current != -1 && current < lowest)
			lowest = current;
	}

	return lowest;
}
3288
3289 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3290 struct fuse_file_info *fi)
3291 {
3292 struct fuse_context *fc = fuse_get_context();
3293 struct file_info *d = (struct file_info *)fi->fh;
3294 char *cg;
3295 char *memusage_str = NULL, *memstat_str = NULL,
3296 *memswlimit_str = NULL, *memswusage_str = NULL;
3297 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3298 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3299 active_file = 0, inactive_file = 0, unevictable = 0,
3300 hostswtotal = 0;
3301 char *line = NULL;
3302 size_t linelen = 0, total_len = 0, rv = 0;
3303 char *cache = d->buf;
3304 size_t cache_size = d->buflen;
3305 FILE *f = NULL;
3306
3307 if (offset){
3308 if (offset > d->size)
3309 return -EINVAL;
3310 if (!d->cached)
3311 return 0;
3312 int left = d->size - offset;
3313 total_len = left > size ? size: left;
3314 memcpy(buf, cache + offset, total_len);
3315 return total_len;
3316 }
3317
3318 pid_t initpid = lookup_initpid_in_store(fc->pid);
3319 if (initpid <= 0)
3320 initpid = fc->pid;
3321 cg = get_pid_cgroup(initpid, "memory");
3322 if (!cg)
3323 return read_file("/proc/meminfo", buf, size, d);
3324 prune_init_slice(cg);
3325
3326 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3327 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3328 goto err;
3329 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3330 goto err;
3331
3332 // Following values are allowed to fail, because swapaccount might be turned
3333 // off for current kernel
3334 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3335 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3336 {
3337 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3338 memswusage = strtoul(memswusage_str, NULL, 10);
3339
3340 memswlimit = memswlimit / 1024;
3341 memswusage = memswusage / 1024;
3342 }
3343
3344 memusage = strtoul(memusage_str, NULL, 10);
3345 memlimit /= 1024;
3346 memusage /= 1024;
3347
3348 parse_memstat(memstat_str, &cached, &active_anon,
3349 &inactive_anon, &active_file, &inactive_file,
3350 &unevictable);
3351
3352 f = fopen("/proc/meminfo", "r");
3353 if (!f)
3354 goto err;
3355
3356 while (getline(&line, &linelen, f) != -1) {
3357 ssize_t l;
3358 char *printme, lbuf[100];
3359
3360 memset(lbuf, 0, 100);
3361 if (startswith(line, "MemTotal:")) {
3362 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3363 if (hosttotal < memlimit)
3364 memlimit = hosttotal;
3365 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3366 printme = lbuf;
3367 } else if (startswith(line, "MemFree:")) {
3368 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3369 printme = lbuf;
3370 } else if (startswith(line, "MemAvailable:")) {
3371 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3372 printme = lbuf;
3373 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3374 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3375 if (hostswtotal < memswlimit)
3376 memswlimit = hostswtotal;
3377 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3378 printme = lbuf;
3379 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3380 unsigned long swaptotal = memswlimit,
3381 swapusage = memswusage - memusage,
3382 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3383 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3384 printme = lbuf;
3385 } else if (startswith(line, "Slab:")) {
3386 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3387 printme = lbuf;
3388 } else if (startswith(line, "Buffers:")) {
3389 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3390 printme = lbuf;
3391 } else if (startswith(line, "Cached:")) {
3392 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3393 printme = lbuf;
3394 } else if (startswith(line, "SwapCached:")) {
3395 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3396 printme = lbuf;
3397 } else if (startswith(line, "Active:")) {
3398 snprintf(lbuf, 100, "Active: %8lu kB\n",
3399 active_anon + active_file);
3400 printme = lbuf;
3401 } else if (startswith(line, "Inactive:")) {
3402 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3403 inactive_anon + inactive_file);
3404 printme = lbuf;
3405 } else if (startswith(line, "Active(anon)")) {
3406 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3407 printme = lbuf;
3408 } else if (startswith(line, "Inactive(anon)")) {
3409 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3410 printme = lbuf;
3411 } else if (startswith(line, "Active(file)")) {
3412 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3413 printme = lbuf;
3414 } else if (startswith(line, "Inactive(file)")) {
3415 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3416 printme = lbuf;
3417 } else if (startswith(line, "Unevictable")) {
3418 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3419 printme = lbuf;
3420 } else if (startswith(line, "SReclaimable")) {
3421 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3422 printme = lbuf;
3423 } else if (startswith(line, "SUnreclaim")) {
3424 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3425 printme = lbuf;
3426 } else
3427 printme = line;
3428
3429 l = snprintf(cache, cache_size, "%s", printme);
3430 if (l < 0) {
3431 perror("Error writing to cache");
3432 rv = 0;
3433 goto err;
3434
3435 }
3436 if (l >= cache_size) {
3437 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3438 rv = 0;
3439 goto err;
3440 }
3441
3442 cache += l;
3443 cache_size -= l;
3444 total_len += l;
3445 }
3446
3447 d->cached = 1;
3448 d->size = total_len;
3449 if (total_len > size ) total_len = size;
3450 memcpy(buf, d->buf, total_len);
3451
3452 rv = total_len;
3453 err:
3454 if (f)
3455 fclose(f);
3456 free(line);
3457 free(cg);
3458 free(memusage_str);
3459 free(memswlimit_str);
3460 free(memswusage_str);
3461 free(memstat_str);
3462 return rv;
3463 }
3464
/*
 * Read cpuset.cpus for cgroup @cg.
 *
 * Returns the value as a newly allocated string which the caller must
 * free, or NULL if it could not be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3477
3478 bool cpu_in_cpuset(int cpu, const char *cpuset);
3479
/*
 * Return true iff @line is a "processor : N" line of /proc/cpuinfo and cpu
 * N is accepted by cpu_in_cpuset() for @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3488
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether it starts with "processor", a colon and a decimal number
 * (with optional whitespace around the colon).
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3500
3501 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3502 struct fuse_file_info *fi)
3503 {
3504 struct fuse_context *fc = fuse_get_context();
3505 struct file_info *d = (struct file_info *)fi->fh;
3506 char *cg;
3507 char *cpuset = NULL;
3508 char *line = NULL;
3509 size_t linelen = 0, total_len = 0, rv = 0;
3510 bool am_printing = false, firstline = true, is_s390x = false;
3511 int curcpu = -1, cpu;
3512 char *cache = d->buf;
3513 size_t cache_size = d->buflen;
3514 FILE *f = NULL;
3515
3516 if (offset){
3517 if (offset > d->size)
3518 return -EINVAL;
3519 if (!d->cached)
3520 return 0;
3521 int left = d->size - offset;
3522 total_len = left > size ? size: left;
3523 memcpy(buf, cache + offset, total_len);
3524 return total_len;
3525 }
3526
3527 pid_t initpid = lookup_initpid_in_store(fc->pid);
3528 if (initpid <= 0)
3529 initpid = fc->pid;
3530 cg = get_pid_cgroup(initpid, "cpuset");
3531 if (!cg)
3532 return read_file("proc/cpuinfo", buf, size, d);
3533 prune_init_slice(cg);
3534
3535 cpuset = get_cpuset(cg);
3536 if (!cpuset)
3537 goto err;
3538
3539 f = fopen("/proc/cpuinfo", "r");
3540 if (!f)
3541 goto err;
3542
3543 while (getline(&line, &linelen, f) != -1) {
3544 ssize_t l;
3545 if (firstline) {
3546 firstline = false;
3547 if (strstr(line, "IBM/S390") != NULL) {
3548 is_s390x = true;
3549 am_printing = true;
3550 continue;
3551 }
3552 }
3553 if (strncmp(line, "# processors:", 12) == 0)
3554 continue;
3555 if (is_processor_line(line)) {
3556 am_printing = cpuline_in_cpuset(line, cpuset);
3557 if (am_printing) {
3558 curcpu ++;
3559 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3560 if (l < 0) {
3561 perror("Error writing to cache");
3562 rv = 0;
3563 goto err;
3564 }
3565 if (l >= cache_size) {
3566 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3567 rv = 0;
3568 goto err;
3569 }
3570 cache += l;
3571 cache_size -= l;
3572 total_len += l;
3573 }
3574 continue;
3575 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3576 char *p;
3577 if (!cpu_in_cpuset(cpu, cpuset))
3578 continue;
3579 curcpu ++;
3580 p = strchr(line, ':');
3581 if (!p || !*p)
3582 goto err;
3583 p++;
3584 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3585 if (l < 0) {
3586 perror("Error writing to cache");
3587 rv = 0;
3588 goto err;
3589 }
3590 if (l >= cache_size) {
3591 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3592 rv = 0;
3593 goto err;
3594 }
3595 cache += l;
3596 cache_size -= l;
3597 total_len += l;
3598 continue;
3599
3600 }
3601 if (am_printing) {
3602 l = snprintf(cache, cache_size, "%s", line);
3603 if (l < 0) {
3604 perror("Error writing to cache");
3605 rv = 0;
3606 goto err;
3607 }
3608 if (l >= cache_size) {
3609 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3610 rv = 0;
3611 goto err;
3612 }
3613 cache += l;
3614 cache_size -= l;
3615 total_len += l;
3616 }
3617 }
3618
3619 if (is_s390x) {
3620 char *origcache = d->buf;
3621 ssize_t l;
3622 do {
3623 d->buf = malloc(d->buflen);
3624 } while (!d->buf);
3625 cache = d->buf;
3626 cache_size = d->buflen;
3627 total_len = 0;
3628 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3629 if (l < 0 || l >= cache_size) {
3630 free(origcache);
3631 goto err;
3632 }
3633 cache_size -= l;
3634 cache += l;
3635 total_len += l;
3636 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3637 if (l < 0 || l >= cache_size) {
3638 free(origcache);
3639 goto err;
3640 }
3641 cache_size -= l;
3642 cache += l;
3643 total_len += l;
3644 l = snprintf(cache, cache_size, "%s", origcache);
3645 free(origcache);
3646 if (l < 0 || l >= cache_size)
3647 goto err;
3648 total_len += l;
3649 }
3650
3651 d->cached = 1;
3652 d->size = total_len;
3653 if (total_len > size ) total_len = size;
3654
3655 /* read from off 0 */
3656 memcpy(buf, d->buf, total_len);
3657 rv = total_len;
3658 err:
3659 if (f)
3660 fclose(f);
3661 free(line);
3662 free(cpuset);
3663 free(cg);
3664 return rv;
3665 }
3666
3667 static uint64_t get_reaper_start_time(pid_t pid)
3668 {
3669 int ret;
3670 FILE *f;
3671 uint64_t starttime;
3672 /* strlen("/proc/") = 6
3673 * +
3674 * LXCFS_NUMSTRLEN64
3675 * +
3676 * strlen("/stat") = 5
3677 * +
3678 * \0 = 1
3679 * */
3680 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3681 char path[__PROC_PID_STAT_LEN];
3682 pid_t qpid;
3683
3684 qpid = lookup_initpid_in_store(pid);
3685 if (qpid <= 0) {
3686 /* Caller can check for EINVAL on 0. */
3687 errno = EINVAL;
3688 return 0;
3689 }
3690
3691 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3692 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3693 /* Caller can check for EINVAL on 0. */
3694 errno = EINVAL;
3695 return 0;
3696 }
3697
3698 f = fopen(path, "r");
3699 if (!f) {
3700 /* Caller can check for EINVAL on 0. */
3701 errno = EINVAL;
3702 return 0;
3703 }
3704
3705 /* Note that the *scanf() argument supression requires that length
3706 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3707 * at us. It's like telling someone you're not married and then asking
3708 * if you can bring your wife to the party.
3709 */
3710 ret = fscanf(f, "%*d " /* (1) pid %d */
3711 "%*s " /* (2) comm %s */
3712 "%*c " /* (3) state %c */
3713 "%*d " /* (4) ppid %d */
3714 "%*d " /* (5) pgrp %d */
3715 "%*d " /* (6) session %d */
3716 "%*d " /* (7) tty_nr %d */
3717 "%*d " /* (8) tpgid %d */
3718 "%*u " /* (9) flags %u */
3719 "%*u " /* (10) minflt %lu */
3720 "%*u " /* (11) cminflt %lu */
3721 "%*u " /* (12) majflt %lu */
3722 "%*u " /* (13) cmajflt %lu */
3723 "%*u " /* (14) utime %lu */
3724 "%*u " /* (15) stime %lu */
3725 "%*d " /* (16) cutime %ld */
3726 "%*d " /* (17) cstime %ld */
3727 "%*d " /* (18) priority %ld */
3728 "%*d " /* (19) nice %ld */
3729 "%*d " /* (20) num_threads %ld */
3730 "%*d " /* (21) itrealvalue %ld */
3731 "%" PRIu64, /* (22) starttime %llu */
3732 &starttime);
3733 if (ret != 1) {
3734 fclose(f);
3735 /* Caller can check for EINVAL on 0. */
3736 errno = EINVAL;
3737 return 0;
3738 }
3739
3740 fclose(f);
3741
3742 errno = 0;
3743 return starttime;
3744 }
3745
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to seconds by dividing it by sysconf(_SC_CLK_TCK).
 *
 * Returns 0 if the start time could not be retrieved or if the number of
 * clock ticks per second could not be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* Reject every non-positive value, not only "-1 with EINVAL": a zero
	 * would divide by zero and a negative value would be converted to a
	 * huge unsigned divisor below.
	 */
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3767
/*
 * Return how long, in whole seconds, the reaper of @pid has been running:
 * the CLOCK_BOOTTIME reading minus the reaper's start time as returned by
 * get_reaper_start_time_in_sec().
 *
 * Returns 0 when the start time is 0 (unknown) or when the clock cannot be
 * read. The tv_nsec part of the clock is deliberately ignored; sub-second
 * precision is not worth the extra work here.
 */
static uint64_t get_reaper_age(pid_t pid)
{
	struct timespec since_boot;
	uint64_t started = get_reaper_start_time_in_sec(pid);

	if (started == 0)
		return 0;

	if (clock_gettime(CLOCK_BOOTTIME, &since_boot) < 0)
		return 0;

	return (uint64_t)since_boot.tv_sec - started;
}
3796
3797 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3798 static int proc_stat_read(char *buf, size_t size, off_t offset,
3799 struct fuse_file_info *fi)
3800 {
3801 struct fuse_context *fc = fuse_get_context();
3802 struct file_info *d = (struct file_info *)fi->fh;
3803 char *cg;
3804 char *cpuset = NULL;
3805 char *line = NULL;
3806 size_t linelen = 0, total_len = 0, rv = 0;
3807 int curcpu = -1; /* cpu numbering starts at 0 */
3808 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3809 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3810 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3811 char cpuall[CPUALL_MAX_SIZE];
3812 /* reserve for cpu all */
3813 char *cache = d->buf + CPUALL_MAX_SIZE;
3814 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3815 FILE *f = NULL;
3816
3817 if (offset){
3818 if (offset > d->size)
3819 return -EINVAL;
3820 if (!d->cached)
3821 return 0;
3822 int left = d->size - offset;
3823 total_len = left > size ? size: left;
3824 memcpy(buf, d->buf + offset, total_len);
3825 return total_len;
3826 }
3827
3828 pid_t initpid = lookup_initpid_in_store(fc->pid);
3829 if (initpid <= 0)
3830 initpid = fc->pid;
3831 cg = get_pid_cgroup(initpid, "cpuset");
3832 if (!cg)
3833 return read_file("/proc/stat", buf, size, d);
3834 prune_init_slice(cg);
3835
3836 cpuset = get_cpuset(cg);
3837 if (!cpuset)
3838 goto err;
3839
3840 f = fopen("/proc/stat", "r");
3841 if (!f)
3842 goto err;
3843
3844 //skip first line
3845 if (getline(&line, &linelen, f) < 0) {
3846 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3847 goto err;
3848 }
3849
3850 while (getline(&line, &linelen, f) != -1) {
3851 ssize_t l;
3852 int cpu;
3853 char cpu_char[10]; /* That's a lot of cores */
3854 char *c;
3855
3856 if (strlen(line) == 0)
3857 continue;
3858 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3859 /* not a ^cpuN line containing a number N, just print it */
3860 l = snprintf(cache, cache_size, "%s", line);
3861 if (l < 0) {
3862 perror("Error writing to cache");
3863 rv = 0;
3864 goto err;
3865 }
3866 if (l >= cache_size) {
3867 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3868 rv = 0;
3869 goto err;
3870 }
3871 cache += l;
3872 cache_size -= l;
3873 total_len += l;
3874 continue;
3875 }
3876
3877 if (sscanf(cpu_char, "%d", &cpu) != 1)
3878 continue;
3879 if (!cpu_in_cpuset(cpu, cpuset))
3880 continue;
3881 curcpu ++;
3882
3883 c = strchr(line, ' ');
3884 if (!c)
3885 continue;
3886 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3887 if (l < 0) {
3888 perror("Error writing to cache");
3889 rv = 0;
3890 goto err;
3891
3892 }
3893 if (l >= cache_size) {
3894 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3895 rv = 0;
3896 goto err;
3897 }
3898
3899 cache += l;
3900 cache_size -= l;
3901 total_len += l;
3902
3903 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3904 &user,
3905 &nice,
3906 &system,
3907 &idle,
3908 &iowait,
3909 &irq,
3910 &softirq,
3911 &steal,
3912 &guest,
3913 &guest_nice) != 10)
3914 continue;
3915 user_sum += user;
3916 nice_sum += nice;
3917 system_sum += system;
3918 idle_sum += idle;
3919 iowait_sum += iowait;
3920 irq_sum += irq;
3921 softirq_sum += softirq;
3922 steal_sum += steal;
3923 guest_sum += guest;
3924 guest_nice_sum += guest_nice;
3925 }
3926
3927 cache = d->buf;
3928
3929 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3930 user_sum,
3931 nice_sum,
3932 system_sum,
3933 idle_sum,
3934 iowait_sum,
3935 irq_sum,
3936 softirq_sum,
3937 steal_sum,
3938 guest_sum,
3939 guest_nice_sum);
3940 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3941 memcpy(cache, cpuall, cpuall_len);
3942 cache += cpuall_len;
3943 } else {
3944 /* shouldn't happen */
3945 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3946 cpuall_len = 0;
3947 }
3948
3949 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3950 total_len += cpuall_len;
3951 d->cached = 1;
3952 d->size = total_len;
3953 if (total_len > size)
3954 total_len = size;
3955
3956 memcpy(buf, d->buf, total_len);
3957 rv = total_len;
3958
3959 err:
3960 if (f)
3961 fclose(f);
3962 free(line);
3963 free(cpuset);
3964 free(cg);
3965 return rv;
3966 }
3967
/*
 * Return the busy time of the group of tasks @task belongs to, taken from the
 * cpuacct.usage file of the cpuacct cgroup of @task's init pid and divided by
 * 1000000000 (presumably converting nanoseconds to seconds).
 *
 * This only makes sense when the container has been given its own cpuacct
 * cgroup. If it has not, the value also accounts for tasks that do not belong
 * to the container. If someone has a clever solution for this, please send a
 * patch!
 *
 * Returns 0 if the init pid, the cgroup or the usage value cannot be found.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;
	pid_t initpid = lookup_initpid_in_store(task);

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup) {
		prune_init_slice(cgroup);
		if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
			usage = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(cgroup);
	free(usage_str);
	return usage;
}
3998
#if RELOADTEST
/*
 * Reload-test helper: create (or truncate) the marker file
 * /tmp/lxcfs-iwashere. Failure to create it is silently ignored.
 */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd < 0)
		return;
	close(fd);
}
#endif
4009
4010 /*
4011 * We read /proc/uptime and reuse its second field.
4012 * For the first field, we use the mtime for the reaper for
4013 * the calling pid as returned by getreaperage
4014 */
4015 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4016 struct fuse_file_info *fi)
4017 {
4018 struct fuse_context *fc = fuse_get_context();
4019 struct file_info *d = (struct file_info *)fi->fh;
4020 unsigned long int busytime = get_reaper_busy(fc->pid);
4021 char *cache = d->buf;
4022 ssize_t total_len = 0;
4023 uint64_t idletime, reaperage;
4024
4025 #if RELOADTEST
4026 iwashere();
4027 #endif
4028
4029 if (offset){
4030 if (!d->cached)
4031 return 0;
4032 if (offset > d->size)
4033 return -EINVAL;
4034 int left = d->size - offset;
4035 total_len = left > size ? size: left;
4036 memcpy(buf, cache + offset, total_len);
4037 return total_len;
4038 }
4039
4040 reaperage = get_reaper_age(fc->pid);
4041 /* To understand why this is done, please read the comment to the
4042 * get_reaper_busy() function.
4043 */
4044 idletime = reaperage;
4045 if (reaperage >= busytime)
4046 idletime = reaperage - busytime;
4047
4048 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4049 if (total_len < 0 || total_len >= d->buflen){
4050 lxcfs_error("%s\n", "failed to write to cache");
4051 return 0;
4052 }
4053
4054 d->size = (int)total_len;
4055 d->cached = 1;
4056
4057 if (total_len > size) total_len = size;
4058
4059 memcpy(buf, d->buf, total_len);
4060 return total_len;
4061 }
4062
4063 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4064 struct fuse_file_info *fi)
4065 {
4066 char dev_name[72];
4067 struct fuse_context *fc = fuse_get_context();
4068 struct file_info *d = (struct file_info *)fi->fh;
4069 char *cg;
4070 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4071 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4072 unsigned long read = 0, write = 0;
4073 unsigned long read_merged = 0, write_merged = 0;
4074 unsigned long read_sectors = 0, write_sectors = 0;
4075 unsigned long read_ticks = 0, write_ticks = 0;
4076 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4077 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4078 char *cache = d->buf;
4079 size_t cache_size = d->buflen;
4080 char *line = NULL;
4081 size_t linelen = 0, total_len = 0, rv = 0;
4082 unsigned int major = 0, minor = 0;
4083 int i = 0;
4084 FILE *f = NULL;
4085
4086 if (offset){
4087 if (offset > d->size)
4088 return -EINVAL;
4089 if (!d->cached)
4090 return 0;
4091 int left = d->size - offset;
4092 total_len = left > size ? size: left;
4093 memcpy(buf, cache + offset, total_len);
4094 return total_len;
4095 }
4096
4097 pid_t initpid = lookup_initpid_in_store(fc->pid);
4098 if (initpid <= 0)
4099 initpid = fc->pid;
4100 cg = get_pid_cgroup(initpid, "blkio");
4101 if (!cg)
4102 return read_file("/proc/diskstats", buf, size, d);
4103 prune_init_slice(cg);
4104
4105 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4106 goto err;
4107 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4108 goto err;
4109 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4110 goto err;
4111 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4112 goto err;
4113 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4114 goto err;
4115
4116
4117 f = fopen("/proc/diskstats", "r");
4118 if (!f)
4119 goto err;
4120
4121 while (getline(&line, &linelen, f) != -1) {
4122 ssize_t l;
4123 char lbuf[256];
4124
4125 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4126 if (i != 3)
4127 continue;
4128
4129 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4130 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4131 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4132 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4133 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4134 read_sectors = read_sectors/512;
4135 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4136 write_sectors = write_sectors/512;
4137
4138 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4139 rd_svctm = rd_svctm/1000000;
4140 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4141 rd_wait = rd_wait/1000000;
4142 read_ticks = rd_svctm + rd_wait;
4143
4144 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4145 wr_svctm = wr_svctm/1000000;
4146 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4147 wr_wait = wr_wait/1000000;
4148 write_ticks = wr_svctm + wr_wait;
4149
4150 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4151 tot_ticks = tot_ticks/1000000;
4152
4153 memset(lbuf, 0, 256);
4154 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4155 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4156 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4157 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4158 else
4159 continue;
4160
4161 l = snprintf(cache, cache_size, "%s", lbuf);
4162 if (l < 0) {
4163 perror("Error writing to fuse buf");
4164 rv = 0;
4165 goto err;
4166 }
4167 if (l >= cache_size) {
4168 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4169 rv = 0;
4170 goto err;
4171 }
4172 cache += l;
4173 cache_size -= l;
4174 total_len += l;
4175 }
4176
4177 d->cached = 1;
4178 d->size = total_len;
4179 if (total_len > size ) total_len = size;
4180 memcpy(buf, d->buf, total_len);
4181
4182 rv = total_len;
4183 err:
4184 free(cg);
4185 if (f)
4186 fclose(f);
4187 free(line);
4188 free(io_serviced_str);
4189 free(io_merged_str);
4190 free(io_service_bytes_str);
4191 free(io_wait_time_str);
4192 free(io_service_time_str);
4193 return rv;
4194 }
4195
4196 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4197 struct fuse_file_info *fi)
4198 {
4199 struct fuse_context *fc = fuse_get_context();
4200 struct file_info *d = (struct file_info *)fi->fh;
4201 char *cg = NULL;
4202 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4203 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4204 ssize_t total_len = 0, rv = 0;
4205 ssize_t l = 0;
4206 char *cache = d->buf;
4207
4208 if (offset) {
4209 if (offset > d->size)
4210 return -EINVAL;
4211 if (!d->cached)
4212 return 0;
4213 int left = d->size - offset;
4214 total_len = left > size ? size: left;
4215 memcpy(buf, cache + offset, total_len);
4216 return total_len;
4217 }
4218
4219 pid_t initpid = lookup_initpid_in_store(fc->pid);
4220 if (initpid <= 0)
4221 initpid = fc->pid;
4222 cg = get_pid_cgroup(initpid, "memory");
4223 if (!cg)
4224 return read_file("/proc/swaps", buf, size, d);
4225 prune_init_slice(cg);
4226
4227 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4228
4229 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4230 goto err;
4231
4232 memusage = strtoul(memusage_str, NULL, 10);
4233
4234 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4235 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4236
4237 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4238 memswusage = strtoul(memswusage_str, NULL, 10);
4239
4240 swap_total = (memswlimit - memlimit) / 1024;
4241 swap_free = (memswusage - memusage) / 1024;
4242 }
4243
4244 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4245
4246 /* When no mem + swap limit is specified or swapaccount=0*/
4247 if (!memswlimit) {
4248 char *line = NULL;
4249 size_t linelen = 0;
4250 FILE *f = fopen("/proc/meminfo", "r");
4251
4252 if (!f)
4253 goto err;
4254
4255 while (getline(&line, &linelen, f) != -1) {
4256 if (startswith(line, "SwapTotal:")) {
4257 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4258 } else if (startswith(line, "SwapFree:")) {
4259 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4260 }
4261 }
4262
4263 free(line);
4264 fclose(f);
4265 }
4266
4267 if (swap_total > 0) {
4268 l = snprintf(d->buf + total_len, d->size - total_len,
4269 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4270 swap_total, swap_free);
4271 total_len += l;
4272 }
4273
4274 if (total_len < 0 || l < 0) {
4275 perror("Error writing to cache");
4276 rv = 0;
4277 goto err;
4278 }
4279
4280 d->cached = 1;
4281 d->size = (int)total_len;
4282
4283 if (total_len > size) total_len = size;
4284 memcpy(buf, d->buf, total_len);
4285 rv = total_len;
4286
4287 err:
4288 free(cg);
4289 free(memswlimit_str);
4290 free(memlimit_str);
4291 free(memusage_str);
4292 free(memswusage_str);
4293 return rv;
4294 }
/*
 * Collect the lines of the cgroup.procs files found in a cgroup directory and,
 * up to @depth levels deep, in its sub-directories.
 * eg: from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
 *
 * @pid_buf : array the collected lines are appended to. It is grown with
 *            realloc(); every entry is a malloc()ed copy of one line exactly
 *            as returned by getline() (i.e. including the trailing newline).
 *            The caller owns the array and its entries.
 * @dpath   : the path of the cgroup, relative to @cfd.
 *            eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many directory levels below @dpath are still descended into.
 * @sum     : number of entries already stored in @pid_buf.
 * @cfd     : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Returns the new number of entries in @pid_buf. Directories or files that
 * cannot be opened are skipped silently.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + \0 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* skip only the two special entries, not every dot-file */
		if (strcmp(file->d_name, ".") == 0)
			continue;
		if (strcmp(file->d_name, "..") == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name + \0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	/* getline() allocated this buffer; it is NULL if nothing was read */
	free(line);
	free(path);
	return sum;
}
4379 /*
4380 * calc_load calculates the load according to the following formula:
4381 * load1 = load0 * exp + active * (1 - exp)
4382 *
4383 * @load1: the new loadavg.
4384 * @load0: the former loadavg.
4385 * @active: the total number of running pid at this moment.
4386 * @exp: the fixed-point defined in the beginning.
4387 */
4388 static unsigned long
4389 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4390 {
4391 unsigned long newload;
4392
4393 active = active > 0 ? active * FIXED_1 : 0;
4394 newload = load * exp + active * (FIXED_1 - exp);
4395 if (active >= load)
4396 newload += FIXED_1 - 1;
4397
4398 return newload / FIXED_1;
4399 }
4400
4401 /*
4402 * Return 0 means that container p->cg is closed.
4403 * Return -1 means that error occurred in refresh.
4404 * Positive num equals the total number of pid.
4405 */
4406 static int refresh_load(struct load_node *p, char *path)
4407 {
4408 FILE *f = NULL;
4409 char **idbuf;
4410 char proc_path[256];
4411 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4412 char *line = NULL;
4413 size_t linelen = 0;
4414 int sum, length;
4415 DIR *dp;
4416 struct dirent *file;
4417
4418 do {
4419 idbuf = malloc(sizeof(char *));
4420 } while (!idbuf);
4421 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4422 /* normal exit */
4423 if (sum == 0)
4424 goto out;
4425
4426 for (i = 0; i < sum; i++) {
4427 /*clean up '\n' */
4428 length = strlen(idbuf[i])-1;
4429 idbuf[i][length] = '\0';
4430 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4431 if (ret < 0 || ret > 255) {
4432 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4433 i = sum;
4434 sum = -1;
4435 goto err_out;
4436 }
4437
4438 dp = opendir(proc_path);
4439 if (!dp) {
4440 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4441 continue;
4442 }
4443 while ((file = readdir(dp)) != NULL) {
4444 if (strncmp(file->d_name, ".", 1) == 0)
4445 continue;
4446 if (strncmp(file->d_name, "..", 1) == 0)
4447 continue;
4448 total_pid++;
4449 /* We make the biggest pid become last_pid.*/
4450 ret = atof(file->d_name);
4451 last_pid = (ret > last_pid) ? ret : last_pid;
4452
4453 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4454 if (ret < 0 || ret > 255) {
4455 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4456 i = sum;
4457 sum = -1;
4458 closedir(dp);
4459 goto err_out;
4460 }
4461 f = fopen(proc_path, "r");
4462 if (f != NULL) {
4463 while (getline(&line, &linelen, f) != -1) {
4464 /* Find State */
4465 if ((line[0] == 'S') && (line[1] == 't'))
4466 break;
4467 }
4468 if ((line[7] == 'R') || (line[7] == 'D'))
4469 run_pid++;
4470 fclose(f);
4471 }
4472 }
4473 closedir(dp);
4474 }
4475 /*Calculate the loadavg.*/
4476 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4477 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4478 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4479 p->run_pid = run_pid;
4480 p->total_pid = total_pid;
4481 p->last_pid = last_pid;
4482
4483 free(line);
4484 err_out:
4485 for (; i > 0; i--)
4486 free(idbuf[i-1]);
4487 out:
4488 free(idbuf);
4489 return sum;
4490 }
4491 /*
4492 * Traverse the hash table and update it.
4493 */
4494 void *load_begin(void *arg)
4495 {
4496
4497 char *path = NULL;
4498 int i, sum, length, ret;
4499 struct load_node *f;
4500 int first_node;
4501 clock_t time1, time2;
4502
4503 while (1) {
4504 if (loadavg_stop == 1)
4505 return NULL;
4506
4507 time1 = clock();
4508 for (i = 0; i < LOAD_SIZE; i++) {
4509 pthread_mutex_lock(&load_hash[i].lock);
4510 if (load_hash[i].next == NULL) {
4511 pthread_mutex_unlock(&load_hash[i].lock);
4512 continue;
4513 }
4514 f = load_hash[i].next;
4515 first_node = 1;
4516 while (f) {
4517 length = strlen(f->cg) + 2;
4518 do {
4519 /* strlen(f->cg) + '.' or '' + \0 */
4520 path = malloc(length);
4521 } while (!path);
4522
4523 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4524 if (ret < 0 || ret > length - 1) {
4525 /* snprintf failed, ignore the node.*/
4526 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4527 goto out;
4528 }
4529 sum = refresh_load(f, path);
4530 if (sum == 0) {
4531 f = del_node(f, i);
4532 } else {
4533 out: f = f->next;
4534 }
4535 free(path);
4536 /* load_hash[i].lock locks only on the first node.*/
4537 if (first_node == 1) {
4538 first_node = 0;
4539 pthread_mutex_unlock(&load_hash[i].lock);
4540 }
4541 }
4542 }
4543
4544 if (loadavg_stop == 1)
4545 return NULL;
4546
4547 time2 = clock();
4548 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4549 }
4550 }
4551
4552 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4553 struct fuse_file_info *fi)
4554 {
4555 struct fuse_context *fc = fuse_get_context();
4556 struct file_info *d = (struct file_info *)fi->fh;
4557 pid_t initpid;
4558 char *cg;
4559 size_t total_len = 0;
4560 char *cache = d->buf;
4561 struct load_node *n;
4562 int hash;
4563 int cfd, rv = 0;
4564 unsigned long a, b, c;
4565
4566 if (offset) {
4567 if (offset > d->size)
4568 return -EINVAL;
4569 if (!d->cached)
4570 return 0;
4571 int left = d->size - offset;
4572 total_len = left > size ? size : left;
4573 memcpy(buf, cache + offset, total_len);
4574 return total_len;
4575 }
4576 if (!loadavg)
4577 return read_file("/proc/loadavg", buf, size, d);
4578
4579 initpid = lookup_initpid_in_store(fc->pid);
4580 if (initpid <= 0)
4581 initpid = fc->pid;
4582 cg = get_pid_cgroup(initpid, "cpu");
4583 if (!cg)
4584 return read_file("/proc/loadavg", buf, size, d);
4585
4586 prune_init_slice(cg);
4587 hash = calc_hash(cg);
4588 n = locate_node(cg, hash);
4589
4590 /* First time */
4591 if (n == NULL) {
4592 if (!find_mounted_controller("cpu", &cfd)) {
4593 /*
4594 * In locate_node() above, pthread_rwlock_unlock() isn't used
4595 * because delete is not allowed before read has ended.
4596 */
4597 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4598 rv = 0;
4599 goto err;
4600 }
4601 do {
4602 n = malloc(sizeof(struct load_node));
4603 } while (!n);
4604
4605 do {
4606 n->cg = malloc(strlen(cg)+1);
4607 } while (!n->cg);
4608 strcpy(n->cg, cg);
4609 n->avenrun[0] = 0;
4610 n->avenrun[1] = 0;
4611 n->avenrun[2] = 0;
4612 n->run_pid = 0;
4613 n->total_pid = 1;
4614 n->last_pid = initpid;
4615 n->cfd = cfd;
4616 insert_node(&n, hash);
4617 }
4618 a = n->avenrun[0] + (FIXED_1/200);
4619 b = n->avenrun[1] + (FIXED_1/200);
4620 c = n->avenrun[2] + (FIXED_1/200);
4621 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4622 LOAD_INT(a), LOAD_FRAC(a),
4623 LOAD_INT(b), LOAD_FRAC(b),
4624 LOAD_INT(c), LOAD_FRAC(c),
4625 n->run_pid, n->total_pid, n->last_pid);
4626 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4627 if (total_len < 0 || total_len >= d->buflen) {
4628 lxcfs_error("%s\n", "Failed to write to cache");
4629 rv = 0;
4630 goto err;
4631 }
4632 d->size = (int)total_len;
4633 d->cached = 1;
4634
4635 if (total_len > size)
4636 total_len = size;
4637 memcpy(buf, d->buf, total_len);
4638 rv = total_len;
4639
4640 err:
4641 free(cg);
4642 return rv;
4643 }
4644 /* Return a positive number on success, return 0 on failure.*/
4645 pthread_t load_daemon(int load_use)
4646 {
4647 int ret;
4648 pthread_t pid;
4649
4650 ret = init_load();
4651 if (ret == -1) {
4652 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4653 return 0;
4654 }
4655 ret = pthread_create(&pid, NULL, load_begin, NULL);
4656 if (ret != 0) {
4657 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4658 load_free();
4659 return 0;
4660 }
4661 /* use loadavg, here loadavg = 1*/
4662 loadavg = load_use;
4663 return pid;
4664 }
4665
4666 /* Returns 0 on success. */
4667 int stop_load_daemon(pthread_t pid)
4668 {
4669 int s;
4670
4671 /* Signal the thread to gracefully stop */
4672 loadavg_stop = 1;
4673
4674 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
4675 if (s != 0) {
4676 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4677 return -1;
4678 }
4679
4680 load_free();
4681 loadavg_stop = 0;
4682
4683 return 0;
4684 }
4685
/*
 * Return the number of bytes that can be read line by line from the file
 * @which, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;
	FILE *fp = fopen(which, "r");

	if (!fp)
		return 0;

	for (;;) {
		ssize_t got = getline(&linebuf, &cap, fp);

		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
4702
/*
 * Fill in @sb for one of the paths served below /proc.
 *
 * "/proc" is reported as a directory (mode 0555, 2 links); each of the
 * virtualized files is reported as an empty regular file (mode 0444, 1 link).
 * Owner and group are 0 and all three timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be read and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(proc_files) / sizeof(proc_files[0]); idx++) {
		if (strcmp(path, proc_files[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4732
4733 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4734 struct fuse_file_info *fi)
4735 {
4736 if (filler(buf, ".", NULL, 0) != 0 ||
4737 filler(buf, "..", NULL, 0) != 0 ||
4738 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4739 filler(buf, "meminfo", NULL, 0) != 0 ||
4740 filler(buf, "stat", NULL, 0) != 0 ||
4741 filler(buf, "uptime", NULL, 0) != 0 ||
4742 filler(buf, "diskstats", NULL, 0) != 0 ||
4743 filler(buf, "swaps", NULL, 0) != 0 ||
4744 filler(buf, "loadavg", NULL, 0) != 0)
4745 return -EINVAL;
4746 return 0;
4747 }
4748
4749 int proc_open(const char *path, struct fuse_file_info *fi)
4750 {
4751 int type = -1;
4752 struct file_info *info;
4753
4754 if (strcmp(path, "/proc/meminfo") == 0)
4755 type = LXC_TYPE_PROC_MEMINFO;
4756 else if (strcmp(path, "/proc/cpuinfo") == 0)
4757 type = LXC_TYPE_PROC_CPUINFO;
4758 else if (strcmp(path, "/proc/uptime") == 0)
4759 type = LXC_TYPE_PROC_UPTIME;
4760 else if (strcmp(path, "/proc/stat") == 0)
4761 type = LXC_TYPE_PROC_STAT;
4762 else if (strcmp(path, "/proc/diskstats") == 0)
4763 type = LXC_TYPE_PROC_DISKSTATS;
4764 else if (strcmp(path, "/proc/swaps") == 0)
4765 type = LXC_TYPE_PROC_SWAPS;
4766 else if (strcmp(path, "/proc/loadavg") == 0)
4767 type = LXC_TYPE_PROC_LOADAVG;
4768 if (type == -1)
4769 return -ENOENT;
4770
4771 info = malloc(sizeof(*info));
4772 if (!info)
4773 return -ENOMEM;
4774
4775 memset(info, 0, sizeof(*info));
4776 info->type = type;
4777
4778 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4779 do {
4780 info->buf = malloc(info->buflen);
4781 } while (!info->buf);
4782 memset(info->buf, 0, info->buflen);
4783 /* set actual size to buffer size */
4784 info->size = info->buflen;
4785
4786 fi->fh = (unsigned long)info;
4787 return 0;
4788 }
4789
/*
 * Access check for the paths served below /proc.
 *
 * "/proc" itself is granted whenever access(2) says the path is readable,
 * whatever @mask asks for. Everything else is read-only: any bit in @mask
 * other than R_OK yields -EACCES, otherwise 0 is returned.
 */
int proc_access(const char *path, int mask)
{
	const int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4800
/*
 * Release callback for the /proc files: hands @fi to do_release_file_info()
 * and always returns 0. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
4806
4807 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4808 struct fuse_file_info *fi)
4809 {
4810 struct file_info *f = (struct file_info *) fi->fh;
4811
4812 switch (f->type) {
4813 case LXC_TYPE_PROC_MEMINFO:
4814 return proc_meminfo_read(buf, size, offset, fi);
4815 case LXC_TYPE_PROC_CPUINFO:
4816 return proc_cpuinfo_read(buf, size, offset, fi);
4817 case LXC_TYPE_PROC_UPTIME:
4818 return proc_uptime_read(buf, size, offset, fi);
4819 case LXC_TYPE_PROC_STAT:
4820 return proc_stat_read(buf, size, offset, fi);
4821 case LXC_TYPE_PROC_DISKSTATS:
4822 return proc_diskstats_read(buf, size, offset, fi);
4823 case LXC_TYPE_PROC_SWAPS:
4824 return proc_swaps_read(buf, size, offset, fi);
4825 case LXC_TYPE_PROC_LOADAVG:
4826 return proc_loadavg_read(buf, size, offset, fi);
4827 default:
4828 return -EINVAL;
4829 }
4830 }
4831
/*
 * Functions needed to set up cgroups in the __constructor__.
 */
4835
/*
 * Create @dir and every missing parent directory, each with @mode.
 *
 * The path is walked component by component; for every prefix ending at a
 * component boundary mkdir() is attempted and EEXIST is tolerated.
 *
 * Returns true when all components exist afterwards, false on allocation
 * failure or when mkdir() fails with anything other than EEXIST (the error
 * is logged).
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Skip separators, then advance to the end of the next component. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* Prefix of the original path up to the current component. */
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;

		/*
		 * For a relative path the very first prefix is the empty string.
		 * mkdir("") fails with ENOENT, so there is nothing to create for
		 * it; skip it instead of aborting the whole operation.
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while (tmp != dir);

	return true;
}
4859
4860 static bool umount_if_mounted(void)
4861 {
4862 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4863 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4864 return false;
4865 }
4866 return true;
4867 }
4868
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Report whether the filesystem described by @fs has the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic wanted = (fs_type_magic)magic_val;
	const bool matches = (fs->f_type == wanted);

	return matches;
}
4875
/*
 * Detect whether "/" is the initial rootfs.
 *
 * Looking at fs/proc_namespace.c, the rootfs entry in mountinfo can be
 * expected to contain the very specific text " - rootfs rootfs ". As long as
 * we have chrooted so that rootfs is not our root, that entry should always
 * be skipped in the mountinfo contents.
 *
 * Returns true if a mountinfo line whose mount point is "/" carries the
 * "- rootfs rootfs " marker; false otherwise, or if mountinfo is unreadable.
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *entry = NULL;
	size_t cap = 0;
	bool on_ramfs = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!on_ramfs && getline(&entry, &cap, mounts) != -1) {
		char *field = entry;
		char *field_end, *sep;
		int skipped;

		/* Walk to the blank in front of the fifth field (the mount point). */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		/* Terminate the mount point so it can be compared as a string. */
		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		sep = strchr(field_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			on_ramfs = true;
	}

	free(entry);
	fclose(mounts);
	return on_ramfs;
}
4918
4919 static int pivot_enter()
4920 {
4921 int ret = -1, oldroot = -1, newroot = -1;
4922
4923 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4924 if (oldroot < 0) {
4925 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4926 return ret;
4927 }
4928
4929 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4930 if (newroot < 0) {
4931 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4932 goto err;
4933 }
4934
4935 /* change into new root fs */
4936 if (fchdir(newroot) < 0) {
4937 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4938 goto err;
4939 }
4940
4941 /* pivot_root into our new root fs */
4942 if (pivot_root(".", ".") < 0) {
4943 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4944 goto err;
4945 }
4946
4947 /*
4948 * At this point the old-root is mounted on top of our new-root.
4949 * To unmounted it we must not be chdir'd into it, so escape back
4950 * to the old-root.
4951 */
4952 if (fchdir(oldroot) < 0) {
4953 lxcfs_error("%s\n", "Failed to enter old root.");
4954 goto err;
4955 }
4956
4957 if (umount2(".", MNT_DETACH) < 0) {
4958 lxcfs_error("%s\n", "Failed to detach old root.");
4959 goto err;
4960 }
4961
4962 if (fchdir(newroot) < 0) {
4963 lxcfs_error("%s\n", "Failed to re-enter new root.");
4964 goto err;
4965 }
4966
4967 ret = 0;
4968
4969 err:
4970 if (oldroot > 0)
4971 close(oldroot);
4972 if (newroot > 0)
4973 close(newroot);
4974
4975 return ret;
4976 }
4977
4978 static int chroot_enter()
4979 {
4980 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4981 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4982 return -1;
4983 }
4984
4985 if (chroot(".") < 0) {
4986 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4987 return -1;
4988 }
4989
4990 if (chdir("/") < 0) {
4991 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4992 return -1;
4993 }
4994
4995 return 0;
4996 }
4997
/*
 * Enter the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise.
 *
 * Returns the result of chroot_enter() on the ramfs path, otherwise 0 on
 * success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() alone is not reliable: when the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. So when it says no, is_on_ramfs() still
	 * checks /proc/self/mountinfo.
	 */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
5020
5021 /* Prepare our new clean root. */
5022 static int permute_prepare(void)
5023 {
5024 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5025 lxcfs_error("%s\n", "Failed to create directory for new root.");
5026 return -1;
5027 }
5028
5029 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5030 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5031 return -1;
5032 }
5033
5034 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5035 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5036 return -1;
5037 }
5038
5039 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5040 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5041 return -1;
5042 }
5043
5044 return 0;
5045 }
5046
/*
 * Switch to the new root: chroot() on ramfs, pivot_root() in all other cases.
 *
 * The new root is prepared first and only entered if preparation succeeded.
 * Returns true when both steps succeed.
 */
static bool permute_root(void)
{
	const bool prepared = (permute_prepare() >= 0);

	/* Short-circuit: never try to enter a root that was not prepared. */
	return prepared && permute_and_enter() >= 0;
}
5060
5061 static int preserve_mnt_ns(int pid)
5062 {
5063 int ret;
5064 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5065 char path[len];
5066
5067 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5068 if (ret < 0 || (size_t)ret >= len)
5069 return -1;
5070
5071 return open(path, O_RDONLY | O_CLOEXEC);
5072 }
5073
5074 static bool cgfs_prepare_mounts(void)
5075 {
5076 if (!mkdir_p(BASEDIR, 0700)) {
5077 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5078 return false;
5079 }
5080
5081 if (!umount_if_mounted()) {
5082 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5083 return false;
5084 }
5085
5086 if (unshare(CLONE_NEWNS) < 0) {
5087 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5088 return false;
5089 }
5090
5091 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5092 if (cgroup_mount_ns_fd < 0) {
5093 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5094 return false;
5095 }
5096
5097 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5098 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5099 return false;
5100 }
5101
5102 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5103 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5104 return false;
5105 }
5106
5107 return true;
5108 }
5109
5110 static bool cgfs_mount_hierarchies(void)
5111 {
5112 char *target;
5113 size_t clen, len;
5114 int i, ret;
5115
5116 for (i = 0; i < num_hierarchies; i++) {
5117 char *controller = hierarchies[i];
5118
5119 clen = strlen(controller);
5120 len = strlen(BASEDIR) + clen + 2;
5121 target = malloc(len);
5122 if (!target)
5123 return false;
5124
5125 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5126 if (ret < 0 || ret >= len) {
5127 free(target);
5128 return false;
5129 }
5130 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5131 free(target);
5132 return false;
5133 }
5134 if (!strcmp(controller, "unified"))
5135 ret = mount("none", target, "cgroup2", 0, NULL);
5136 else
5137 ret = mount(controller, target, "cgroup", 0, controller);
5138 if (ret < 0) {
5139 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5140 free(target);
5141 return false;
5142 }
5143
5144 fd_hierarchies[i] = open(target, O_DIRECTORY);
5145 if (fd_hierarchies[i] < 0) {
5146 free(target);
5147 return false;
5148 }
5149 free(target);
5150 }
5151 return true;
5152 }
5153
/*
 * Build the private cgroup mount tree used by lxcfs.
 *
 * Runs, in order, cgfs_prepare_mounts(), cgfs_mount_hierarchies() and
 * permute_root(), stopping at the first failure. Returns true only if all
 * three succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5169
5170 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5171 {
5172 FILE *f;
5173 char *cret, *line = NULL;
5174 char cwd[MAXPATHLEN];
5175 size_t len = 0;
5176 int i, init_ns = -1;
5177 bool found_unified = false;
5178
5179 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5180 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5181 return;
5182 }
5183
5184 while (getline(&line, &len, f) != -1) {
5185 char *idx, *p, *p2;
5186
5187 p = strchr(line, ':');
5188 if (!p)
5189 goto out;
5190 idx = line;
5191 *(p++) = '\0';
5192
5193 p2 = strrchr(p, ':');
5194 if (!p2)
5195 goto out;
5196 *p2 = '\0';
5197
5198 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5199 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5200 * because it parses out the empty string "" and later on passes
5201 * it to mount(). Let's skip such entries.
5202 */
5203 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5204 found_unified = true;
5205 p = "unified";
5206 }
5207
5208 if (!store_hierarchy(line, p))
5209 goto out;
5210 }
5211
5212 /* Preserve initial namespace. */
5213 init_ns = preserve_mnt_ns(getpid());
5214 if (init_ns < 0) {
5215 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5216 goto out;
5217 }
5218
5219 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5220 if (!fd_hierarchies) {
5221 lxcfs_error("%s\n", strerror(errno));
5222 goto out;
5223 }
5224
5225 for (i = 0; i < num_hierarchies; i++)
5226 fd_hierarchies[i] = -1;
5227
5228 cret = getcwd(cwd, MAXPATHLEN);
5229 if (!cret)
5230 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5231
5232 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5233 * to privately mount lxcfs cgroups. */
5234 if (!cgfs_setup_controllers()) {
5235 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5236 goto out;
5237 }
5238
5239 if (setns(init_ns, 0) < 0) {
5240 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5241 goto out;
5242 }
5243
5244 if (!cret || chdir(cwd) < 0)
5245 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5246
5247 print_subsystems();
5248
5249 out:
5250 free(line);
5251 fclose(f);
5252 if (init_ns >= 0)
5253 close(init_ns);
5254 }
5255
5256 static void __attribute__((destructor)) free_subsystems(void)
5257 {
5258 int i;
5259
5260 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5261
5262 for (i = 0; i < num_hierarchies; i++) {
5263 if (hierarchies[i])
5264 free(hierarchies[i]);
5265 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5266 close(fd_hierarchies[i]);
5267 }
5268 free(hierarchies);
5269 free(fd_hierarchies);
5270
5271 if (cgroup_mount_ns_fd >= 0)
5272 close(cgroup_mount_ns_fd);
5273 }