1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
42 /* The maximum 64-bit unsigned integer, 2^64 - 1, needs 20 decimal digits plus a terminating NUL: 21 characters. */
43 #define LXCFS_NUMSTRLEN64 21
44
45 /* Define pivot_root() if missing from the C library */
46 #ifndef HAVE_PIVOT_ROOT
47 static int pivot_root(const char * new_root, const char * put_old)
48 {
49 #ifdef __NR_pivot_root
50 return syscall(__NR_pivot_root, new_root, put_old);
51 #else
52 errno = ENOSYS;
53 return -1;
54 #endif
55 }
56 #else
57 extern int pivot_root(const char * new_root, const char * put_old);
58 #endif
59
60 enum {
61 LXC_TYPE_CGDIR,
62 LXC_TYPE_CGFILE,
63 LXC_TYPE_PROC_MEMINFO,
64 LXC_TYPE_PROC_CPUINFO,
65 LXC_TYPE_PROC_UPTIME,
66 LXC_TYPE_PROC_STAT,
67 LXC_TYPE_PROC_DISKSTATS,
68 LXC_TYPE_PROC_SWAPS,
69 LXC_TYPE_PROC_LOADAVG,
70 };
71
72 struct file_info {
73 char *controller;
74 char *cgroup;
75 char *file;
76 int type;
77 char *buf; // unused as of yet
78 int buflen;
79 int size; //actual data size
80 int cached;
81 };
82
83 struct cpuacct_usage {
84 uint64_t user;
85 uint64_t system;
86 };
87
88 /* Hash table used for caching per-cgroup load averages. */
89 #define LOAD_SIZE 100 /* number of hash buckets */
90 #define FLUSH_TIME 5 /* refresh interval, in seconds */
91 #define DEPTH_DIR 3 /* directory depth scanned per cgroup */
92 /* Fixed-point constants used to calculate the load average. */
93 #define FSHIFT 11 /* nr of bits of precision */
94 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
95 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
96 #define EXP_5 2014 /* 1/exp(5sec/5min) */
97 #define EXP_15 2037 /* 1/exp(5sec/15min) */
98 #define LOAD_INT(x) ((x) >> FSHIFT)
99 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
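/* For illustration: load averages are kept in this fixed-point format,
 * so a load of 1.25 is stored as 1.25 * FIXED_1 = 2560 and is printed
 * the way /proc/loadavg does it:
 *
 *   unsigned long v = 2560;
 *   printf("%lu.%02lu", LOAD_INT(v), LOAD_FRAC(v));  // prints "1.25"
 *
 * EXP_1/EXP_5/EXP_15 are the usual kernel decay factors for the 1, 5
 * and 15 minute exponentially-weighted averages.
 */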
100 /*
101 * This parameter is used by proc_loadavg_read():
102 * 1 means the virtualized loadavg is used, 0 means it is not.
103 */
104 static int loadavg = 0;
105 static volatile sig_atomic_t loadavg_stop = 0;
106 static int calc_hash(char *name)
107 {
108 unsigned int hash = 0;
109 unsigned int x = 0;
110 /* ELFHash algorithm. */
111 while (*name) {
112 hash = (hash << 4) + *name++;
113 x = hash & 0xf0000000;
114 if (x != 0)
115 hash ^= (x >> 24);
116 hash &= ~x;
117 }
118 return ((hash & 0x7fffffff) % LOAD_SIZE);
119 }
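/* Sketch of the intended usage (locate_node() and insert_node() are
 * defined further below): a cgroup path is hashed to a bucket index and
 * then looked up or inserted in that bucket, roughly:
 *
 *   int idx = calc_hash(cg);                  // 0 <= idx < LOAD_SIZE
 *   struct load_node *n = locate_node(cg, idx);
 *   if (!n)
 *       insert_node(&new_node, idx);          // new_node->cg == cg
 */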
120
121 struct load_node {
122 char *cg; /* cgroup path */
123 unsigned long avenrun[3]; /* Load averages */
124 unsigned int run_pid;
125 unsigned int total_pid;
126 unsigned int last_pid;
127 int cfd; /* The file descriptor of the mounted cgroup */
128 struct load_node *next;
129 struct load_node **pre;
130 };
131
132 struct load_head {
133 /*
134 * The lock protects insertion and refresh of load_node entries. For
135 * the first load_node of each hash bucket, insert and refresh are
136 * mutually exclusive within that bucket.
137 */
138 pthread_mutex_t lock;
139 /*
140 * The rdlock serializes reading the loadavg against deleting a
141 * load_node. Within each hash bucket, read and delete are mutually
142 * exclusive, but concurrent readers are allowed. This rwlock works at
143 * the list level.
144 */
145 pthread_rwlock_t rdlock;
146 /*
147 * The rilock serializes reading the loadavg against inserting a
148 * load_node. For the first load_node of each hash bucket, read and
149 * insert are mutually exclusive, but concurrent readers are allowed.
150 */
151 pthread_rwlock_t rilock;
151 struct load_node *next;
152 };
153
154 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155 /*
156 * init_load() initializes the hash table.
157 * Return 0 on success, return -1 on failure.
158 */
159 static int init_load(void)
160 {
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183 out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185 out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187 out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195 }
196
197 static void insert_node(struct load_node **n, int locate)
198 {
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212 }
213 /*
214 * locate_node() looks up the node for a given cgroup; a non-NULL
215 * return means success. Note that rdlock is intentionally not released
216 * here: the caller is still reading the node, and deletion must not
217 * happen before that read has finished. The rdlock is released only in
218 * proc_loadavg_read().
219 */
220 static struct load_node *locate_node(char *cg, int locate)
221 {
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236 }
237 /* Delete load_node n and return the node that followed it. */
238 static struct load_node *del_node(struct load_node *n, int locate)
239 {
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254 }
255
256 static void load_free(void)
257 {
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287 }
288 /* Reserve buffer size to account for file size changes. */
289 #define BUF_RESERVE_SIZE 512
290
291 /*
292 * A table caching which pid is init for a pid namespace.
293 * When looking up which pid is init for $qpid, we first
294 * 1. Stat /proc/$qpid/ns/pid.
295 * 2. Check whether the ino_t is in our store.
296 * a. if not, fork a child in qpid's ns to send us
297 * ucred.pid = 1, and read the initpid. Cache
298 * initpid and creation time for /proc/initpid
299 * in a new store entry.
300 * b. if so, verify that /proc/initpid still matches
301 * what we have saved. If not, clear the store
302 * entry and go back to a. If so, return the
303 * cached initpid.
304 */
305 struct pidns_init_store {
306 ino_t ino; // inode number for /proc/$pid/ns/pid
307 pid_t initpid; // the pid of init in that ns
308 long int ctime; // the time at which /proc/$initpid was created
309 struct pidns_init_store *next;
310 long int lastcheck;
311 };
312
313 /* lol - look at how they are allocated in the kernel */
314 #define PIDNS_HASH_SIZE 4096
315 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
316
317 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
318 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
319 static void lock_mutex(pthread_mutex_t *l)
320 {
321 int ret;
322
323 if ((ret = pthread_mutex_lock(l)) != 0) {
324 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
325 exit(1);
326 }
327 }
328
329 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
330 * Number of hierarchies mounted. */
331 static int num_hierarchies;
332
333 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
334 * Hierarchies mounted {cpuset, blkio, ...}:
335 * Initialized via __constructor__ collect_and_mount_subsystems(). */
336 static char **hierarchies;
337
338 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
339 * Open file descriptors:
340 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
341 * private mount namespace.
342 * Initialized via __constructor__ collect_and_mount_subsystems().
343 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
344 * mounts and respective files in the private namespace even when located in
345 * another namespace using the *at() family of functions
346 * {openat(), fchownat(), ...}. */
347 static int *fd_hierarchies;
348 static int cgroup_mount_ns_fd = -1;
349
350 static void unlock_mutex(pthread_mutex_t *l)
351 {
352 int ret;
353
354 if ((ret = pthread_mutex_unlock(l)) != 0) {
355 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
356 exit(1);
357 }
358 }
359
360 static void store_lock(void)
361 {
362 lock_mutex(&pidns_store_mutex);
363 }
364
365 static void store_unlock(void)
366 {
367 unlock_mutex(&pidns_store_mutex);
368 }
369
370 /* Must be called under store_lock */
371 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372 {
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386 }
387
388 /* Must be called under store_lock */
389 static void remove_initpid(struct pidns_init_store *e)
390 {
391 struct pidns_init_store *tmp;
392 int h;
393
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412 }
413
414 #define PURGE_SECS 5
415 /* Must be called under store_lock */
416 static void prune_initpid_store(void)
417 {
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
430
431 lxcfs_debug("%s\n", "Pruning.");
432
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455 }
456
457 /* Must be called under store_lock */
458 static void save_initpid(struct stat *sb, pid_t pid)
459 {
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480 }
481
482 /*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490 {
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507 }
508
509 static int is_dir(const char *path, int fd)
510 {
511 struct stat statbuf;
512 int ret = fstatat(fd, path, &statbuf, 0);
513 if (ret == 0 && S_ISDIR(statbuf.st_mode))
514 return 1;
515 return 0;
516 }
517
518 static char *must_copy_string(const char *str)
519 {
520 char *dup = NULL;
521 if (!str)
522 return NULL;
523 do {
524 dup = strdup(str);
525 } while (!dup);
526
527 return dup;
528 }
529
530 static inline void drop_trailing_newlines(char *s)
531 {
532 int l;
533
534 for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
535 s[l-1] = '\0';
536 }
537
538 #define BATCH_SIZE 50
539 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
540 {
541 int newbatches = (newlen / BATCH_SIZE) + 1;
542 int oldbatches = (oldlen / BATCH_SIZE) + 1;
543
544 if (!*mem || newbatches > oldbatches) {
545 char *tmp;
546 do {
547 tmp = realloc(*mem, newbatches * BATCH_SIZE);
548 } while (!tmp);
549 *mem = tmp;
550 }
551 }
552 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
553 {
554 size_t newlen = *len + linelen;
555 dorealloc(contents, *len, newlen + 1);
556 memcpy(*contents + *len, line, linelen+1);
557 *len = newlen;
558 }
559
560 static char *slurp_file(const char *from, int fd)
561 {
562 char *line = NULL;
563 char *contents = NULL;
564 FILE *f = fdopen(fd, "r");
565 size_t len = 0, fulllen = 0;
566 ssize_t linelen;
567
568 if (!f)
569 return NULL;
570
571 while ((linelen = getline(&line, &len, f)) != -1) {
572 append_line(&contents, &fulllen, line, linelen);
573 }
574 fclose(f);
575
576 if (contents)
577 drop_trailing_newlines(contents);
578 free(line);
579 return contents;
580 }
581
582 static bool write_string(const char *fnam, const char *string, int fd)
583 {
584 FILE *f;
585 size_t len, ret;
586
587 f = fdopen(fd, "w");
588 if (!f)
589 return false;
590
591 len = strlen(string);
592 ret = fwrite(string, 1, len, f);
593 if (ret != len) {
594 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
595 strerror(errno), string, fnam);
596 fclose(f);
597 return false;
598 }
599
600 if (fclose(f) < 0) {
601 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
602 return false;
603 }
604
605 return true;
606 }
607
608 struct cgfs_files {
609 char *name;
610 uint32_t uid, gid;
611 uint32_t mode;
612 };
613
614 #define ALLOC_NUM 20
615 static bool store_hierarchy(char *stridx, char *h)
616 {
617 if (num_hierarchies % ALLOC_NUM == 0) {
618 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
619 n *= ALLOC_NUM;
620 char **tmp = realloc(hierarchies, n * sizeof(char *));
621 if (!tmp) {
622 lxcfs_error("%s\n", strerror(errno));
623 exit(1);
624 }
625 hierarchies = tmp;
626 }
627
628 hierarchies[num_hierarchies++] = must_copy_string(h);
629 return true;
630 }
631
632 static void print_subsystems(void)
633 {
634 int i;
635
636 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
637 fprintf(stderr, "hierarchies:\n");
638 for (i = 0; i < num_hierarchies; i++) {
639 if (hierarchies[i])
640 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
641 fd_hierarchies[i], hierarchies[i]);
642 }
643 }
644
645 static bool in_comma_list(const char *needle, const char *haystack)
646 {
647 const char *s = haystack, *e;
648 size_t nlen = strlen(needle);
649
650 while (*s && (e = strchr(s, ','))) {
651 if (nlen != e - s) {
652 s = e + 1;
653 continue;
654 }
655 if (strncmp(needle, s, nlen) == 0)
656 return true;
657 s = e + 1;
658 }
659 if (strcmp(needle, s) == 0)
660 return true;
661 return false;
662 }
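/* Examples:
 *   in_comma_list("cpu", "cpu,cpuacct")      -> true  (first element)
 *   in_comma_list("cpuacct", "cpu,cpuacct")  -> true  (last element)
 *   in_comma_list("cpuset", "cpu,cpuacct")   -> false (no element is an
 *                                                      exact match)
 */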
663
664 /* do we need to do any massaging here? I'm not sure... */
665 /* Return the mounted controller and store the corresponding open file descriptor
666 * referring to the controller mountpoint in the private lxcfs namespace in
667 * @cfd.
668 */
669 static char *find_mounted_controller(const char *controller, int *cfd)
670 {
671 int i;
672
673 for (i = 0; i < num_hierarchies; i++) {
674 if (!hierarchies[i])
675 continue;
676 if (strcmp(hierarchies[i], controller) == 0) {
677 *cfd = fd_hierarchies[i];
678 return hierarchies[i];
679 }
680 if (in_comma_list(controller, hierarchies[i])) {
681 *cfd = fd_hierarchies[i];
682 return hierarchies[i];
683 }
684 }
685
686 return NULL;
687 }
688
689 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
690 const char *value)
691 {
692 int ret, fd, cfd;
693 size_t len;
694 char *fnam, *tmpc;
695
696 tmpc = find_mounted_controller(controller, &cfd);
697 if (!tmpc)
698 return false;
699
700 /* Make sure we pass a relative path to *at() family of functions.
701 * . + /cgroup + / + file + \0
702 */
703 len = strlen(cgroup) + strlen(file) + 3;
704 fnam = alloca(len);
705 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
706 if (ret < 0 || (size_t)ret >= len)
707 return false;
708
709 fd = openat(cfd, fnam, O_WRONLY);
710 if (fd < 0)
711 return false;
712
713 return write_string(fnam, value, fd);
714 }
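/* Example (the cgroup path "/lxc/c1" here is purely hypothetical):
 *
 *   if (!cgfs_set_value("memory", "/lxc/c1", "memory.limit_in_bytes",
 *                       "536870912"))
 *       lxcfs_error("%s\n", "Failed to set the memory limit");
 *
 * The controller must be one of the hierarchies known to
 * find_mounted_controller(); the cgroup path is resolved relative to
 * that hierarchy's mountpoint in the private mount namespace.
 */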
715
716 // Chown all the files in the cgroup directory. We do this when we create
717 // a cgroup on behalf of a user.
718 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
719 {
720 struct dirent *direntp;
721 char path[MAXPATHLEN];
722 size_t len;
723 DIR *d;
724 int fd1, ret;
725
726 len = strlen(dirname);
727 if (len >= MAXPATHLEN) {
728 lxcfs_error("Pathname too long: %s\n", dirname);
729 return;
730 }
731
732 fd1 = openat(fd, dirname, O_DIRECTORY);
733 if (fd1 < 0)
734 return;
735
736 d = fdopendir(fd1);
737 if (!d) {
738 lxcfs_error("Failed to open %s\n", dirname);
739 return;
740 }
741
742 while ((direntp = readdir(d))) {
743 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
744 continue;
745 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
746 if (ret < 0 || ret >= MAXPATHLEN) {
747 lxcfs_error("Pathname too long under %s\n", dirname);
748 continue;
749 }
750 if (fchownat(fd, path, uid, gid, 0) < 0)
751 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
752 }
753 closedir(d);
754 }
755
756 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
757 {
758 int cfd;
759 size_t len;
760 char *dirnam, *tmpc;
761
762 tmpc = find_mounted_controller(controller, &cfd);
763 if (!tmpc)
764 return -EINVAL;
765
766 /* Make sure we pass a relative path to *at() family of functions.
767 * . + /cg + \0
768 */
769 len = strlen(cg) + 2;
770 dirnam = alloca(len);
771 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
772
773 if (mkdirat(cfd, dirnam, 0755) < 0)
774 return -errno;
775
776 if (uid == 0 && gid == 0)
777 return 0;
778
779 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
780 return -errno;
781
782 chown_all_cgroup_files(dirnam, uid, gid, cfd);
783
784 return 0;
785 }
786
787 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
788 {
789 struct dirent *direntp;
790 DIR *dir;
791 bool ret = false;
792 char pathname[MAXPATHLEN];
793 int dupfd;
794
795 dupfd = dup(fd); // fdopendir() takes ownership of the fd it is given, so work on a duplicate.
796 if (dupfd < 0)
797 return false;
798
799 dir = fdopendir(dupfd);
800 if (!dir) {
801 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
802 close(dupfd);
803 return false;
804 }
805
806 while ((direntp = readdir(dir))) {
807 struct stat mystat;
808 int rc;
809
810 if (!strcmp(direntp->d_name, ".") ||
811 !strcmp(direntp->d_name, ".."))
812 continue;
813
814 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
815 if (rc < 0 || rc >= MAXPATHLEN) {
816 lxcfs_error("%s\n", "Pathname too long.");
817 continue;
818 }
819
820 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
821 if (rc) {
822 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
823 continue;
824 }
825 if (S_ISDIR(mystat.st_mode))
826 if (!recursive_rmdir(pathname, fd, cfd))
827 lxcfs_debug("Error removing %s.\n", pathname);
828 }
829
830 ret = true;
831 if (closedir(dir) < 0) {
832 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
833 ret = false;
834 }
835
836 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
837 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
838 ret = false;
839 }
840
841 close(dupfd);
842
843 return ret;
844 }
845
846 bool cgfs_remove(const char *controller, const char *cg)
847 {
848 int fd, cfd;
849 size_t len;
850 char *dirnam, *tmpc;
851 bool bret;
852
853 tmpc = find_mounted_controller(controller, &cfd);
854 if (!tmpc)
855 return false;
856
857 /* Make sure we pass a relative path to *at() family of functions.
858 * . + /cg + \0
859 */
860 len = strlen(cg) + 2;
861 dirnam = alloca(len);
862 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
863
864 fd = openat(cfd, dirnam, O_DIRECTORY);
865 if (fd < 0)
866 return false;
867
868 bret = recursive_rmdir(dirnam, fd, cfd);
869 close(fd);
870 return bret;
871 }
872
873 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
874 {
875 int cfd;
876 size_t len;
877 char *pathname, *tmpc;
878
879 tmpc = find_mounted_controller(controller, &cfd);
880 if (!tmpc)
881 return false;
882
883 /* Make sure we pass a relative path to *at() family of functions.
884 * . + /file + \0
885 */
886 len = strlen(file) + 2;
887 pathname = alloca(len);
888 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
889 if (fchmodat(cfd, pathname, mode, 0) < 0)
890 return false;
891 return true;
892 }
893
894 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
895 {
896 size_t len;
897 char *fname;
898
899 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
900 fname = alloca(len);
901 snprintf(fname, len, "%s/tasks", dirname);
902 if (fchownat(fd, fname, uid, gid, 0) != 0)
903 return -errno;
904 snprintf(fname, len, "%s/cgroup.procs", dirname);
905 if (fchownat(fd, fname, uid, gid, 0) != 0)
906 return -errno;
907 return 0;
908 }
909
910 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
911 {
912 int cfd;
913 size_t len;
914 char *pathname, *tmpc;
915
916 tmpc = find_mounted_controller(controller, &cfd);
917 if (!tmpc)
918 return -EINVAL;
919
920 /* Make sure we pass a relative path to *at() family of functions.
921 * . + /file + \0
922 */
923 len = strlen(file) + 2;
924 pathname = alloca(len);
925 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
926 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
927 return -errno;
928
929 if (is_dir(pathname, cfd))
930 // like cgmanager did, we want to chown the tasks file as well
931 return chown_tasks_files(pathname, uid, gid, cfd);
932
933 return 0;
934 }
935
936 FILE *open_pids_file(const char *controller, const char *cgroup)
937 {
938 int fd, cfd;
939 size_t len;
940 char *pathname, *tmpc;
941
942 tmpc = find_mounted_controller(controller, &cfd);
943 if (!tmpc)
944 return NULL;
945
946 /* Make sure we pass a relative path to *at() family of functions.
947 * . + /cgroup + / "cgroup.procs" + \0
948 */
949 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
950 pathname = alloca(len);
951 snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
952
953 fd = openat(cfd, pathname, O_WRONLY);
954 if (fd < 0)
955 return NULL;
956
957 return fdopen(fd, "w");
958 }
959
960 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
961 void ***list, size_t typesize,
962 void* (*iterator)(const char*, const char*, const char*))
963 {
964 int cfd, fd, ret;
965 size_t len;
966 char *cg, *tmpc;
967 char pathname[MAXPATHLEN];
968 size_t sz = 0, asz = 0;
969 struct dirent *dirent;
970 DIR *dir;
971
972 tmpc = find_mounted_controller(controller, &cfd);
973 *list = NULL;
974 if (!tmpc)
975 return false;
976
977 /* Make sure we pass a relative path to *at() family of functions. */
978 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
979 cg = alloca(len);
980 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
981 if (ret < 0 || (size_t)ret >= len) {
982 lxcfs_error("Pathname too long under %s\n", cgroup);
983 return false;
984 }
985
986 fd = openat(cfd, cg, O_DIRECTORY);
987 if (fd < 0)
988 return false;
989
990 dir = fdopendir(fd);
991 if (!dir)
992 return false;
993
994 while ((dirent = readdir(dir))) {
995 struct stat mystat;
996
997 if (!strcmp(dirent->d_name, ".") ||
998 !strcmp(dirent->d_name, ".."))
999 continue;
1000
1001 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1002 if (ret < 0 || ret >= MAXPATHLEN) {
1003 lxcfs_error("Pathname too long under %s\n", cg);
1004 continue;
1005 }
1006
1007 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1008 if (ret) {
1009 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1010 continue;
1011 }
1012 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1013 (directories && !S_ISDIR(mystat.st_mode)))
1014 continue;
1015
1016 if (sz+2 >= asz) {
1017 void **tmp;
1018 asz += BATCH_SIZE;
1019 do {
1020 tmp = realloc(*list, asz * typesize);
1021 } while (!tmp);
1022 *list = tmp;
1023 }
1024 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1025 (*list)[sz+1] = NULL;
1026 sz++;
1027 }
1028 if (closedir(dir) < 0) {
1029 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1030 return false;
1031 }
1032 return true;
1033 }
1034
1035 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1036 {
1037 char *dup;
1038 do {
1039 dup = strdup(dir_entry);
1040 } while (!dup);
1041 return dup;
1042 }
1043
1044 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1045 {
1046 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1047 }
1048
1049 void free_key(struct cgfs_files *k)
1050 {
1051 if (!k)
1052 return;
1053 free(k->name);
1054 free(k);
1055 }
1056
1057 void free_keys(struct cgfs_files **keys)
1058 {
1059 int i;
1060
1061 if (!keys)
1062 return;
1063 for (i = 0; keys[i]; i++) {
1064 free_key(keys[i]);
1065 }
1066 free(keys);
1067 }
1068
1069 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1070 {
1071 int ret, fd, cfd;
1072 size_t len;
1073 char *fnam, *tmpc;
1074
1075 tmpc = find_mounted_controller(controller, &cfd);
1076 if (!tmpc)
1077 return false;
1078
1079 /* Make sure we pass a relative path to *at() family of functions.
1080 * . + /cgroup + / + file + \0
1081 */
1082 len = strlen(cgroup) + strlen(file) + 3;
1083 fnam = alloca(len);
1084 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1085 if (ret < 0 || (size_t)ret >= len)
1086 return false;
1087
1088 fd = openat(cfd, fnam, O_RDONLY);
1089 if (fd < 0)
1090 return false;
1091
1092 *value = slurp_file(fnam, fd);
1093 return *value != NULL;
1094 }
1095
1096 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1097 {
1098 int ret, cfd;
1099 size_t len;
1100 char *fnam, *tmpc;
1101 struct stat sb;
1102 struct cgfs_files *newkey;
1103
1104 tmpc = find_mounted_controller(controller, &cfd);
1105 if (!tmpc)
1106 return false;
1107
1108 if (file && *file == '/')
1109 file++;
1110
1111 if (file && strchr(file, '/'))
1112 return NULL;
1113
1114 /* Make sure we pass a relative path to *at() family of functions.
1115 * . + /cgroup + / + file + \0
1116 */
1117 len = strlen(cgroup) + 3;
1118 if (file)
1119 len += strlen(file) + 1;
1120 fnam = alloca(len);
1121 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1122 file ? "/" : "", file ? file : "");
1123
1124 ret = fstatat(cfd, fnam, &sb, 0);
1125 if (ret < 0)
1126 return NULL;
1127
1128 do {
1129 newkey = malloc(sizeof(struct cgfs_files));
1130 } while (!newkey);
1131 if (file)
1132 newkey->name = must_copy_string(file);
1133 else if (strrchr(cgroup, '/'))
1134 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1135 else
1136 newkey->name = must_copy_string(cgroup);
1137 newkey->uid = sb.st_uid;
1138 newkey->gid = sb.st_gid;
1139 newkey->mode = sb.st_mode;
1140
1141 return newkey;
1142 }
1143
1144 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1145 {
1146 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1147 if (!entry) {
1148 lxcfs_error("Error getting files under %s:%s\n", controller,
1149 cgroup);
1150 }
1151 return entry;
1152 }
1153
1154 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1155 {
1156 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1157 }
1158
1159 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1160 {
1161 int cfd;
1162 size_t len;
1163 char *fnam, *tmpc;
1164 int ret;
1165 struct stat sb;
1166
1167 tmpc = find_mounted_controller(controller, &cfd);
1168 if (!tmpc)
1169 return false;
1170
1171 /* Make sure we pass a relative path to *at() family of functions.
1172 * . + /cgroup + / + f + \0
1173 */
1174 len = strlen(cgroup) + strlen(f) + 3;
1175 fnam = alloca(len);
1176 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1177 if (ret < 0 || (size_t)ret >= len)
1178 return false;
1179
1180 ret = fstatat(cfd, fnam, &sb, 0);
1181 if (ret < 0 || !S_ISDIR(sb.st_mode))
1182 return false;
1183
1184 return true;
1185 }
1186
1187 #define SEND_CREDS_OK 0
1188 #define SEND_CREDS_NOTSK 1
1189 #define SEND_CREDS_FAIL 2
1190 static bool recv_creds(int sock, struct ucred *cred, char *v);
1191 static int wait_for_pid(pid_t pid);
1192 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1193 static int send_creds_clone_wrapper(void *arg);
1194
1195 /*
1196 * Clone a task which switches to @task's namespace and writes '1'
1197 * over a unix socket so we can read the task's reaper's pid in our
1198 * own namespace.
1199 *
1200 * Note: glibc's fork() does not respect pidns, which can lead to failed
1201 * assertions inside glibc (and thus failed forks) if the child's pid in
1202 * the pidns and the parent pid outside are identical. Using clone prevents
1203 * this issue.
1204 */
1205 static void write_task_init_pid_exit(int sock, pid_t target)
1206 {
1207 char fnam[100];
1208 pid_t pid;
1209 int fd, ret;
1210 size_t stack_size = sysconf(_SC_PAGESIZE);
1211 void *stack = alloca(stack_size);
1212
1213 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1214 if (ret < 0 || ret >= sizeof(fnam))
1215 _exit(1);
1216
1217 fd = open(fnam, O_RDONLY);
1218 if (fd < 0) {
1219 perror("write_task_init_pid_exit open of ns/pid");
1220 _exit(1);
1221 }
1222 if (setns(fd, 0)) {
1223 perror("write_task_init_pid_exit setns 1");
1224 close(fd);
1225 _exit(1);
1226 }
1227 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1228 if (pid < 0)
1229 _exit(1);
1230 if (pid != 0) {
1231 if (!wait_for_pid(pid))
1232 _exit(1);
1233 _exit(0);
1234 }
1235 }
1236
1237 static int send_creds_clone_wrapper(void *arg) {
1238 struct ucred cred;
1239 char v;
1240 int sock = *(int *)arg;
1241
1242 /* we are the child */
1243 cred.uid = 0;
1244 cred.gid = 0;
1245 cred.pid = 1;
1246 v = '1';
1247 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1248 return 1;
1249 return 0;
1250 }
1251
1252 static pid_t get_init_pid_for_task(pid_t task)
1253 {
1254 int sock[2];
1255 pid_t pid;
1256 pid_t ret = -1;
1257 char v = '0';
1258 struct ucred cred;
1259
1260 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1261 perror("socketpair");
1262 return -1;
1263 }
1264
1265 pid = fork();
1266 if (pid < 0)
1267 goto out;
1268 if (!pid) {
1269 close(sock[1]);
1270 write_task_init_pid_exit(sock[0], task);
1271 _exit(0);
1272 }
1273
1274 if (!recv_creds(sock[1], &cred, &v))
1275 goto out;
1276 ret = cred.pid;
1277
1278 out:
1279 close(sock[0]);
1280 close(sock[1]);
1281 if (pid > 0)
1282 wait_for_pid(pid);
1283 return ret;
1284 }
1285
1286 static pid_t lookup_initpid_in_store(pid_t qpid)
1287 {
1288 pid_t answer = 0;
1289 struct stat sb;
1290 struct pidns_init_store *e;
1291 char fnam[100];
1292
1293 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1294 store_lock();
1295 if (stat(fnam, &sb) < 0)
1296 goto out;
1297 e = lookup_verify_initpid(&sb);
1298 if (e) {
1299 answer = e->initpid;
1300 goto out;
1301 }
1302 answer = get_init_pid_for_task(qpid);
1303 if (answer > 0)
1304 save_initpid(&sb, answer);
1305
1306 out:
1307 /* Prune only at the end so that pruning cannot invalidate the
1308 * entry whose value we are about to return. */
1309 prune_initpid_store();
1310 store_unlock();
1311 return answer;
1312 }
1313
1314 static int wait_for_pid(pid_t pid)
1315 {
1316 int status, ret;
1317
1318 if (pid <= 0)
1319 return -1;
1320
1321 again:
1322 ret = waitpid(pid, &status, 0);
1323 if (ret == -1) {
1324 if (errno == EINTR)
1325 goto again;
1326 return -1;
1327 }
1328 if (ret != pid)
1329 goto again;
1330 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1331 return -1;
1332 return 0;
1333 }
1334
1335
1336 /*
1337 * append pid to *src.
1338 * src: a pointer to a char* in which to append the pid.
1339 * sz: the number of characters printed so far, minus trailing \0.
1340 * asz: the allocated size so far
1341 * pid: the pid to append
1342 */
1343 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1344 {
1345 char tmp[30];
1346
1347 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1348
1349 if (!*src || tmplen + *sz + 1 >= *asz) {
1350 char *tmp;
1351 do {
1352 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1353 } while (!tmp);
1354 *src = tmp;
1355 *asz += BUF_RESERVE_SIZE;
1356 }
1357 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1358 *sz += tmplen;
1359 }
1360
1361 /*
1362 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1363 * valid in the caller's namespace, return the id mapped into
1364 * pid's namespace.
1365 * Returns the mapped id, or -1 on error.
1366 */
1367 unsigned int
1368 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1369 {
1370 unsigned int nsuid, // base id for a range in the idfile's namespace
1371 hostuid, // base id for a range in the caller's namespace
1372 count; // number of ids in this range
1373 char line[400];
1374 int ret;
1375
1376 fseek(idfile, 0L, SEEK_SET);
1377 while (fgets(line, 400, idfile)) {
1378 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1379 if (ret != 3)
1380 continue;
1381 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1382 /*
1383 * uids wrapped around - unexpected as this is a procfile,
1384 * so just bail.
1385 */
1386 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1387 nsuid, hostuid, count, line);
1388 return -1;
1389 }
1390 if (hostuid <= in_id && hostuid+count > in_id) {
1391 /*
1392 * now since hostuid <= in_id < hostuid+count, and
1393 * hostuid+count and nsuid+count do not wrap around,
1394 * we know that nsuid+(in_id-hostuid), which is less than
1395 * nsuid+count, does not wrap around either.
1396 */
1397 return (in_id - hostuid) + nsuid;
1398 }
1399 }
1400
1401 // no answer found
1402 return -1;
1403 }
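/* Worked example: with a uid_map containing the single line
 *
 *   0     100000      65536
 *
 * (namespace ids 0..65535 map to host ids 100000..165535), passing
 * in_id = 100001 returns (100001 - 100000) + 0 = 1, while an unmapped
 * host id such as 1000 falls through the loop and yields -1.
 */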
1404
1405 /*
1406 * For is_privileged_over():
1407 * specify whether the calling uid is required to be root in its
1408 * own namespace.
1409 */
1410 #define NS_ROOT_REQD true
1411 #define NS_ROOT_OPT false
1412
1413 #define PROCLEN 100
1414
1415 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1416 {
1417 char fpath[PROCLEN];
1418 int ret;
1419 bool answer = false;
1420 uid_t nsuid;
1421
1422 if (victim == -1 || uid == -1)
1423 return false;
1424
1425 /*
1426 * If the request is one not requiring root in the namespace,
1427 * then having the same uid suffices (i.e. uid 1000 has write
1428 * access to files owned by uid 1000).
1429 */
1430 if (!req_ns_root && uid == victim)
1431 return true;
1432
1433 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1434 if (ret < 0 || ret >= PROCLEN)
1435 return false;
1436 FILE *f = fopen(fpath, "r");
1437 if (!f)
1438 return false;
1439
1440 /* if caller's not root in his namespace, reject */
1441 nsuid = convert_id_to_ns(f, uid);
1442 if (nsuid)
1443 goto out;
1444
1445 /*
1446 * If victim is not mapped into caller's ns, reject.
1447 * XXX I'm not sure this check is needed given that fuse
1448 * will be sending requests where the vfs has converted
1449 */
1450 nsuid = convert_id_to_ns(f, victim);
1451 if (nsuid == -1)
1452 goto out;
1453
1454 answer = true;
1455
1456 out:
1457 fclose(f);
1458 return answer;
1459 }
1460
1461 static bool perms_include(int fmode, mode_t req_mode)
1462 {
1463 mode_t r;
1464
1465 switch (req_mode & O_ACCMODE) {
1466 case O_RDONLY:
1467 r = S_IROTH;
1468 break;
1469 case O_WRONLY:
1470 r = S_IWOTH;
1471 break;
1472 case O_RDWR:
1473 r = S_IROTH | S_IWOTH;
1474 break;
1475 default:
1476 return false;
1477 }
1478 return ((fmode & r) == r);
1479 }
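/* Example: for a key with mode 0640 and an O_RDONLY request,
 *   perms_include(0640 >> 6, O_RDONLY) -> true  (owner bits contain r)
 *   perms_include(0640 >> 3, O_RDONLY) -> true  (group bits contain r)
 *   perms_include(0640, O_RDONLY)      -> false (no read bit for other)
 * fc_may_access() below applies these three checks in that order.
 */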
1480
1481
1482 /*
1483 * taskcg is /a/b/c/d/e
1484 * querycg is /a/b/c
1485 * we return 'd'
1486 */
1487 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1488 {
1489 char *start, *end;
1490
1491 if (strlen(taskcg) <= strlen(querycg)) {
1492 lxcfs_error("%s\n", "I was fed bad input.");
1493 return NULL;
1494 }
1495
1496 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1497 start = strdup(taskcg + 1);
1498 else
1499 start = strdup(taskcg + strlen(querycg) + 1);
1500 if (!start)
1501 return NULL;
1502 end = strchr(start, '/');
1503 if (end)
1504 *end = '\0';
1505 return start;
1506 }
1507
1508 static void stripnewline(char *x)
1509 {
1510 size_t l = strlen(x);
1511 if (l && x[l-1] == '\n')
1512 x[l-1] = '\0';
1513 }
1514
1515 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1516 {
1517 int cfd;
1518 char fnam[PROCLEN];
1519 FILE *f;
1520 char *answer = NULL;
1521 char *line = NULL;
1522 size_t len = 0;
1523 int ret;
1524 const char *h = find_mounted_controller(contrl, &cfd);
1525 if (!h)
1526 return NULL;
1527
1528 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1529 if (ret < 0 || ret >= PROCLEN)
1530 return NULL;
1531 if (!(f = fopen(fnam, "r")))
1532 return NULL;
1533
1534 while (getline(&line, &len, f) != -1) {
1535 char *c1, *c2;
1536 if (!line[0])
1537 continue;
1538 c1 = strchr(line, ':');
1539 if (!c1)
1540 goto out;
1541 c1++;
1542 c2 = strchr(c1, ':');
1543 if (!c2)
1544 goto out;
1545 *c2 = '\0';
1546 if (strcmp(c1, h) != 0)
1547 continue;
1548 c2++;
1549 stripnewline(c2);
1550 do {
1551 answer = strdup(c2);
1552 } while (!answer);
1553 break;
1554 }
1555
1556 out:
1557 fclose(f);
1558 free(line);
1559 return answer;
1560 }
1561
1562 /*
1563 * check whether a fuse context may access a cgroup dir or file
1564 *
1565 * If file is not null, it is a cgroup file to check under cg.
1566 * If file is null, then we are checking perms on cg itself.
1567 *
1568 * For files we can check the mode of the list_keys result.
1569 * For cgroups, we must make assumptions based on the files under the
1570 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1571 * yet.
1572 */
1573 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1574 {
1575 struct cgfs_files *k = NULL;
1576 bool ret = false;
1577
1578 k = cgfs_get_key(contrl, cg, file);
1579 if (!k)
1580 return false;
1581
1582 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1583 if (perms_include(k->mode >> 6, mode)) {
1584 ret = true;
1585 goto out;
1586 }
1587 }
1588 if (fc->gid == k->gid) {
1589 if (perms_include(k->mode >> 3, mode)) {
1590 ret = true;
1591 goto out;
1592 }
1593 }
1594 ret = perms_include(k->mode, mode);
1595
1596 out:
1597 free_key(k);
1598 return ret;
1599 }
1600
1601 #define INITSCOPE "/init.scope"
1602 static void prune_init_slice(char *cg)
1603 {
1604 char *point;
1605 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1606
1607 if (cg_len < initscope_len)
1608 return;
1609
1610 point = cg + cg_len - initscope_len;
1611 if (strcmp(point, INITSCOPE) == 0) {
1612 if (point == cg)
1613 *(point+1) = '\0';
1614 else
1615 *point = '\0';
1616 }
1617 }
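/* Example: "/user.slice/init.scope" is trimmed to "/user.slice", a
 * cgroup that is exactly "/init.scope" becomes "/" (the root cgroup),
 * and anything not ending in "/init.scope" is left untouched.
 */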
1618
1619 /*
1620 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1621 * If pid is in /a, he may act on /a/b, but not on /b.
1622 * if the answer is false and nextcg is not NULL, then *nextcg will point
1623 * to a string containing the next cgroup directory under cg, which must be
1624 * freed by the caller.
1625 */
1626 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1627 {
1628 bool answer = false;
1629 char *c2 = get_pid_cgroup(pid, contrl);
1630 char *linecmp;
1631
1632 if (!c2)
1633 return false;
1634 prune_init_slice(c2);
1635
1636 /*
1637 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1638 * they pass in a cgroup without leading '/'
1639 *
1640 * The original line here was:
1641 * linecmp = *cg == '/' ? c2 : c2+1;
1642 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1643 * Serge, do you know?
1644 */
1645 if (*cg == '/' || !strncmp(cg, "./", 2))
1646 linecmp = c2;
1647 else
1648 linecmp = c2 + 1;
1649 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1650 if (nextcg) {
1651 *nextcg = get_next_cgroup_dir(linecmp, cg);
1652 }
1653 goto out;
1654 }
1655 answer = true;
1656
1657 out:
1658 free(c2);
1659 return answer;
1660 }
1661
1662 /*
1663 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1664 */
1665 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1666 {
1667 bool answer = false;
1668 char *c2, *task_cg;
1669 size_t target_len, task_len;
1670
1671 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1672 return true;
1673
1674 c2 = get_pid_cgroup(pid, contrl);
1675 if (!c2)
1676 return false;
1677 prune_init_slice(c2);
1678
1679 task_cg = c2 + 1;
1680 target_len = strlen(cg);
1681 task_len = strlen(task_cg);
1682 if (task_len == 0) {
1683 /* Task is in the root cg, it can see everything. This case is
1684 * not handled by the strcmps below, since they test for the
1685 * last /, but that is the first / that we've chopped off
1686 * above.
1687 */
1688 answer = true;
1689 goto out;
1690 }
1691 if (strcmp(cg, task_cg) == 0) {
1692 answer = true;
1693 goto out;
1694 }
1695 if (target_len < task_len) {
1696 /* looking up a parent dir */
1697 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1698 answer = true;
1699 goto out;
1700 }
1701 if (target_len > task_len) {
1702 /* looking up a child dir */
1703 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1704 answer = true;
1705 goto out;
1706 }
1707
1708 out:
1709 free(c2);
1710 return answer;
1711 }
1712
1713 /*
1714 * given /cgroup/freezer/a/b, return "freezer".
1715 * the returned char* should NOT be freed.
1716 */
1717 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1718 {
1719 const char *p1;
1720 char *contr, *slash;
1721
1722 if (strlen(path) < 9) {
1723 errno = EACCES;
1724 return NULL;
1725 }
1726 if (*(path + 7) != '/') {
1727 errno = EINVAL;
1728 return NULL;
1729 }
1730 p1 = path + 8;
1731 contr = strdupa(p1);
1732 if (!contr) {
1733 errno = ENOMEM;
1734 return NULL;
1735 }
1736 slash = strstr(contr, "/");
1737 if (slash)
1738 *slash = '\0';
1739
1740 int i;
1741 for (i = 0; i < num_hierarchies; i++) {
1742 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1743 return hierarchies[i];
1744 }
1745 errno = ENOENT;
1746 return NULL;
1747 }
1748
1749 /*
1750 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1751 * Note that the returned value may include files (keynames) etc
1752 */
1753 static const char *find_cgroup_in_path(const char *path)
1754 {
1755 const char *p1;
1756
1757 if (strlen(path) < 9) {
1758 errno = EACCES;
1759 return NULL;
1760 }
1761 p1 = strstr(path + 8, "/");
1762 if (!p1) {
1763 errno = EINVAL;
1764 return NULL;
1765 }
1766 errno = 0;
1767 return p1 + 1;
1768 }
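/* Example: for "/cgroup/freezer/a/b/tasks" this returns "a/b/tasks";
 * the trailing keyname stays in the result, and callers such as
 * cg_getattr() split it off with get_cgdir_and_path() when needed.
 */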
1769
1770 /*
1771 * split the last path element from the path in @cg.
1772 * @dir is newly allocated and should be freed, @last not
1773 */
1774 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1775 {
1776 char *p;
1777
1778 do {
1779 *dir = strdup(cg);
1780 } while (!*dir);
1781 *last = strrchr(cg, '/');
1782 if (!*last) {
1783 *last = NULL;
1784 return;
1785 }
1786 p = strrchr(*dir, '/');
1787 *p = '\0';
1788 }
1789
1790 /*
1791 * FUSE ops for /cgroup
1792 */
1793
1794 int cg_getattr(const char *path, struct stat *sb)
1795 {
1796 struct timespec now;
1797 struct fuse_context *fc = fuse_get_context();
1798 char * cgdir = NULL;
1799 char *last = NULL, *path1, *path2;
1800 struct cgfs_files *k = NULL;
1801 const char *cgroup;
1802 const char *controller = NULL;
1803 int ret = -ENOENT;
1804
1805
1806 if (!fc)
1807 return -EIO;
1808
1809 memset(sb, 0, sizeof(struct stat));
1810
1811 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1812 return -EINVAL;
1813
1814 sb->st_uid = sb->st_gid = 0;
1815 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1816 sb->st_size = 0;
1817
1818 if (strcmp(path, "/cgroup") == 0) {
1819 sb->st_mode = S_IFDIR | 00755;
1820 sb->st_nlink = 2;
1821 return 0;
1822 }
1823
1824 controller = pick_controller_from_path(fc, path);
1825 if (!controller)
1826 return -errno;
1827 cgroup = find_cgroup_in_path(path);
1828 if (!cgroup) {
1829 /* this is just /cgroup/controller, return it as a dir */
1830 sb->st_mode = S_IFDIR | 00755;
1831 sb->st_nlink = 2;
1832 return 0;
1833 }
1834
1835 get_cgdir_and_path(cgroup, &cgdir, &last);
1836
1837 if (!last) {
1838 path1 = "/";
1839 path2 = cgdir;
1840 } else {
1841 path1 = cgdir;
1842 path2 = last;
1843 }
1844
1845 pid_t initpid = lookup_initpid_in_store(fc->pid);
1846 if (initpid <= 0)
1847 initpid = fc->pid;
1848 /* Check that path2 is either a child cgroup of path1, or one of its
1849 * keys. Then check that the caller's cgroup is under path if last is a
1850 * child cgroup, or under cgdir if last is a file. */
1851
1852 if (is_child_cgroup(controller, path1, path2)) {
1853 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1854 ret = -ENOENT;
1855 goto out;
1856 }
1857 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1858 /* this is just /cgroup/controller, return it as a dir */
1859 sb->st_mode = S_IFDIR | 00555;
1860 sb->st_nlink = 2;
1861 ret = 0;
1862 goto out;
1863 }
1864 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1865 ret = -EACCES;
1866 goto out;
1867 }
1868
1869 // get uid, gid, from '/tasks' file and make up a mode
1870 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1871 sb->st_mode = S_IFDIR | 00755;
1872 k = cgfs_get_key(controller, cgroup, NULL);
1873 if (!k) {
1874 sb->st_uid = sb->st_gid = 0;
1875 } else {
1876 sb->st_uid = k->uid;
1877 sb->st_gid = k->gid;
1878 }
1879 free_key(k);
1880 sb->st_nlink = 2;
1881 ret = 0;
1882 goto out;
1883 }
1884
1885 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1886 sb->st_mode = S_IFREG | k->mode;
1887 sb->st_nlink = 1;
1888 sb->st_uid = k->uid;
1889 sb->st_gid = k->gid;
1890 sb->st_size = 0;
1891 free_key(k);
1892 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1893 ret = -ENOENT;
1894 goto out;
1895 }
1896 ret = 0;
1897 }
1898
1899 out:
1900 free(cgdir);
1901 return ret;
1902 }
1903
1904 int cg_opendir(const char *path, struct fuse_file_info *fi)
1905 {
1906 struct fuse_context *fc = fuse_get_context();
1907 const char *cgroup;
1908 struct file_info *dir_info;
1909 char *controller = NULL;
1910
1911 if (!fc)
1912 return -EIO;
1913
1914 if (strcmp(path, "/cgroup") == 0) {
1915 cgroup = NULL;
1916 controller = NULL;
1917 } else {
1918 // return list of keys for the controller, and list of child cgroups
1919 controller = pick_controller_from_path(fc, path);
1920 if (!controller)
1921 return -errno;
1922
1923 cgroup = find_cgroup_in_path(path);
1924 if (!cgroup) {
1925 /* this is just /cgroup/controller, return its contents */
1926 cgroup = "/";
1927 }
1928 }
1929
1930 pid_t initpid = lookup_initpid_in_store(fc->pid);
1931 if (initpid <= 0)
1932 initpid = fc->pid;
1933 if (cgroup) {
1934 if (!caller_may_see_dir(initpid, controller, cgroup))
1935 return -ENOENT;
1936 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1937 return -EACCES;
1938 }
1939
1940 /* we'll free this at cg_releasedir */
1941 dir_info = malloc(sizeof(*dir_info));
1942 if (!dir_info)
1943 return -ENOMEM;
1944 dir_info->controller = must_copy_string(controller);
1945 dir_info->cgroup = must_copy_string(cgroup);
1946 dir_info->type = LXC_TYPE_CGDIR;
1947 dir_info->buf = NULL;
1948 dir_info->file = NULL;
1949 dir_info->buflen = 0;
1950
1951 fi->fh = (unsigned long)dir_info;
1952 return 0;
1953 }
1954
1955 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1956 struct fuse_file_info *fi)
1957 {
1958 struct file_info *d = (struct file_info *)fi->fh;
1959 struct cgfs_files **list = NULL;
1960 int i, ret;
1961 char *nextcg = NULL;
1962 struct fuse_context *fc = fuse_get_context();
1963 char **clist = NULL;
1964
1965 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1966 return -EIO;
1967
1968 if (d->type != LXC_TYPE_CGDIR) {
1969 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1970 return -EIO;
1971 }
1972 if (!d->cgroup && !d->controller) {
1973 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1974 int i;
1975
1976 for (i = 0; i < num_hierarchies; i++) {
1977 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1978 return -EIO;
1979 }
1980 }
1981 return 0;
1982 }
1983
1984 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1985 // not a valid cgroup
1986 ret = -EINVAL;
1987 goto out;
1988 }
1989
1990 pid_t initpid = lookup_initpid_in_store(fc->pid);
1991 if (initpid <= 0)
1992 initpid = fc->pid;
1993 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1994 if (nextcg) {
1995 ret = filler(buf, nextcg, NULL, 0);
1996 free(nextcg);
1997 if (ret != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002 ret = 0;
2003 goto out;
2004 }
2005
2006 for (i = 0; list[i]; i++) {
2007 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012
2013 // now get the list of child cgroups
2014
2015 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2016 ret = 0;
2017 goto out;
2018 }
2019 if (clist) {
2020 for (i = 0; clist[i]; i++) {
2021 if (filler(buf, clist[i], NULL, 0) != 0) {
2022 ret = -EIO;
2023 goto out;
2024 }
2025 }
2026 }
2027 ret = 0;
2028
2029 out:
2030 free_keys(list);
2031 if (clist) {
2032 for (i = 0; clist[i]; i++)
2033 free(clist[i]);
2034 free(clist);
2035 }
2036 return ret;
2037 }
2038
2039 static void do_release_file_info(struct fuse_file_info *fi)
2040 {
2041 struct file_info *f = (struct file_info *)fi->fh;
2042
2043 if (!f)
2044 return;
2045
2046 fi->fh = 0;
2047
2048 free(f->controller);
2049 f->controller = NULL;
2050 free(f->cgroup);
2051 f->cgroup = NULL;
2052 free(f->file);
2053 f->file = NULL;
2054 free(f->buf);
2055 f->buf = NULL;
2056 free(f);
2057 }
2058
2059 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2060 {
2061 do_release_file_info(fi);
2062 return 0;
2063 }
2064
2065 int cg_open(const char *path, struct fuse_file_info *fi)
2066 {
2067 const char *cgroup;
2068 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2069 struct cgfs_files *k = NULL;
2070 struct file_info *file_info;
2071 struct fuse_context *fc = fuse_get_context();
2072 int ret;
2073
2074 if (!fc)
2075 return -EIO;
2076
2077 controller = pick_controller_from_path(fc, path);
2078 if (!controller)
2079 return -errno;
2080 cgroup = find_cgroup_in_path(path);
2081 if (!cgroup)
2082 return -errno;
2083
2084 get_cgdir_and_path(cgroup, &cgdir, &last);
2085 if (!last) {
2086 path1 = "/";
2087 path2 = cgdir;
2088 } else {
2089 path1 = cgdir;
2090 path2 = last;
2091 }
2092
2093 k = cgfs_get_key(controller, path1, path2);
2094 if (!k) {
2095 ret = -EINVAL;
2096 goto out;
2097 }
2098 free_key(k);
2099
2100 pid_t initpid = lookup_initpid_in_store(fc->pid);
2101 if (initpid <= 0)
2102 initpid = fc->pid;
2103 if (!caller_may_see_dir(initpid, controller, path1)) {
2104 ret = -ENOENT;
2105 goto out;
2106 }
2107 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2108 ret = -EACCES;
2109 goto out;
2110 }
2111
2112 /* we'll free this at cg_release */
2113 file_info = malloc(sizeof(*file_info));
2114 if (!file_info) {
2115 ret = -ENOMEM;
2116 goto out;
2117 }
2118 file_info->controller = must_copy_string(controller);
2119 file_info->cgroup = must_copy_string(path1);
2120 file_info->file = must_copy_string(path2);
2121 file_info->type = LXC_TYPE_CGFILE;
2122 file_info->buf = NULL;
2123 file_info->buflen = 0;
2124
2125 fi->fh = (unsigned long)file_info;
2126 ret = 0;
2127
2128 out:
2129 free(cgdir);
2130 return ret;
2131 }
2132
2133 int cg_access(const char *path, int mode)
2134 {
2135 int ret;
2136 const char *cgroup;
2137 char *path1, *path2, *controller;
2138 char *last = NULL, *cgdir = NULL;
2139 struct cgfs_files *k = NULL;
2140 struct fuse_context *fc = fuse_get_context();
2141
2142 if (strcmp(path, "/cgroup") == 0)
2143 return 0;
2144
2145 if (!fc)
2146 return -EIO;
2147
2148 controller = pick_controller_from_path(fc, path);
2149 if (!controller)
2150 return -errno;
2151 cgroup = find_cgroup_in_path(path);
2152 if (!cgroup) {
2153 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2154 if ((mode & W_OK) == 0)
2155 return 0;
2156 return -EACCES;
2157 }
2158
2159 get_cgdir_and_path(cgroup, &cgdir, &last);
2160 if (!last) {
2161 path1 = "/";
2162 path2 = cgdir;
2163 } else {
2164 path1 = cgdir;
2165 path2 = last;
2166 }
2167
2168 k = cgfs_get_key(controller, path1, path2);
2169 if (!k) {
2170 if ((mode & W_OK) == 0)
2171 ret = 0;
2172 else
2173 ret = -EACCES;
2174 goto out;
2175 }
2176 free_key(k);
2177
2178 pid_t initpid = lookup_initpid_in_store(fc->pid);
2179 if (initpid <= 0)
2180 initpid = fc->pid;
2181 if (!caller_may_see_dir(initpid, controller, path1)) {
2182 ret = -ENOENT;
2183 goto out;
2184 }
2185 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2186 ret = -EACCES;
2187 goto out;
2188 }
2189
2190 ret = 0;
2191
2192 out:
2193 free(cgdir);
2194 return ret;
2195 }
2196
2197 int cg_release(const char *path, struct fuse_file_info *fi)
2198 {
2199 do_release_file_info(fi);
2200 return 0;
2201 }
2202
2203 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2204
2205 static bool wait_for_sock(int sock, int timeout)
2206 {
2207 struct epoll_event ev;
2208 int epfd, ret, now, starttime, deltatime, saved_errno;
2209
2210 if ((starttime = time(NULL)) < 0)
2211 return false;
2212
2213 if ((epfd = epoll_create(1)) < 0) {
2214 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2215 return false;
2216 }
2217
2218 ev.events = POLLIN_SET;
2219 ev.data.fd = sock;
2220 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2221 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2222 close(epfd);
2223 return false;
2224 }
2225
2226 again:
2227 if ((now = time(NULL)) < 0) {
2228 close(epfd);
2229 return false;
2230 }
2231
2232 deltatime = (starttime + timeout) - now;
2233 if (deltatime < 0) { // timeout
2234 errno = 0;
2235 close(epfd);
2236 return false;
2237 }
2238 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2239 if (ret < 0 && errno == EINTR)
2240 goto again;
2241 saved_errno = errno;
2242 close(epfd);
2243
2244 if (ret <= 0) {
2245 errno = saved_errno;
2246 return false;
2247 }
2248 return true;
2249 }
2250
2251 static int msgrecv(int sockfd, void *buf, size_t len)
2252 {
2253 if (!wait_for_sock(sockfd, 2))
2254 return -1;
2255 return recv(sockfd, buf, len, MSG_DONTWAIT);
2256 }
2257
2258 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2259 {
2260 struct msghdr msg = { 0 };
2261 struct iovec iov;
2262 struct cmsghdr *cmsg;
2263 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2264 char buf[1];
2265 buf[0] = 'p';
2266
2267 if (pingfirst) {
2268 if (msgrecv(sock, buf, 1) != 1) {
2269 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2270 return SEND_CREDS_FAIL;
2271 }
2272 }
2273
2274 msg.msg_control = cmsgbuf;
2275 msg.msg_controllen = sizeof(cmsgbuf);
2276
2277 cmsg = CMSG_FIRSTHDR(&msg);
2278 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2279 cmsg->cmsg_level = SOL_SOCKET;
2280 cmsg->cmsg_type = SCM_CREDENTIALS;
2281 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2282
2283 msg.msg_name = NULL;
2284 msg.msg_namelen = 0;
2285
2286 buf[0] = v;
2287 iov.iov_base = buf;
2288 iov.iov_len = sizeof(buf);
2289 msg.msg_iov = &iov;
2290 msg.msg_iovlen = 1;
2291
2292 if (sendmsg(sock, &msg, 0) < 0) {
2293 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2294 if (errno == 3)
2295 return SEND_CREDS_NOTSK;
2296 return SEND_CREDS_FAIL;
2297 }
2298
2299 return SEND_CREDS_OK;
2300 }
2301
2302 static bool recv_creds(int sock, struct ucred *cred, char *v)
2303 {
2304 struct msghdr msg = { 0 };
2305 struct iovec iov;
2306 struct cmsghdr *cmsg;
2307 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2308 char buf[1];
2309 int ret;
2310 int optval = 1;
2311
2312 *v = '1';
2313
2314 cred->pid = -1;
2315 cred->uid = -1;
2316 cred->gid = -1;
2317
2318 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2319 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2320 return false;
2321 }
2322 buf[0] = '1';
2323 if (write(sock, buf, 1) != 1) {
2324 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2325 return false;
2326 }
2327
2328 msg.msg_name = NULL;
2329 msg.msg_namelen = 0;
2330 msg.msg_control = cmsgbuf;
2331 msg.msg_controllen = sizeof(cmsgbuf);
2332
2333 iov.iov_base = buf;
2334 iov.iov_len = sizeof(buf);
2335 msg.msg_iov = &iov;
2336 msg.msg_iovlen = 1;
2337
2338 if (!wait_for_sock(sock, 2)) {
2339 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2340 return false;
2341 }
2342 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2343 if (ret < 0) {
2344 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2345 return false;
2346 }
2347
2348 cmsg = CMSG_FIRSTHDR(&msg);
2349
2350 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2351 cmsg->cmsg_level == SOL_SOCKET &&
2352 cmsg->cmsg_type == SCM_CREDENTIALS) {
2353 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2354 }
2355 *v = buf[0];
2356
2357 return true;
2358 }
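
/*
 * A sketch of one exchange between recv_creds() and send_creds() as used
 * by the pid-translation helpers below (both ends sit on a SOCK_DGRAM
 * socketpair):
 *
 *   receiver: recv_creds()  - enables SO_PASSCRED and writes a one-byte
 *                             ping so the sender knows it may transmit
 *   sender:   send_creds(sock, &cred, v, true)
 *                           - waits for the ping, then sends one payload
 *                             byte plus an SCM_CREDENTIALS message; the
 *                             kernel rewrites cred.pid into the pid
 *                             namespace of the receiving process
 *   receiver: recv_creds()  - returns the translated ucred and the byte
 *
 * The callers use a payload byte of '1' as an "all done" marker.
 */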
2359
2360 struct pid_ns_clone_args {
2361 int *cpipe;
2362 int sock;
2363 pid_t tpid;
2364 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2365 };
2366
2367 /*
2368 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2369 * with clone(). This simply writes '1' as ACK back to the parent
2370 * before calling the actual wrapped function.
2371 */
2372 static int pid_ns_clone_wrapper(void *arg) {
2373 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2374 char b = '1';
2375
2376 close(args->cpipe[0]);
2377 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2378 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2379 close(args->cpipe[1]);
2380 return args->wrapped(args->sock, args->tpid);
2381 }
2382
2383 /*
2384 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2385 * int value back over the socket. This shifts the pid from the
2386 * sender's pidns into tpid's pidns.
2387 */
2388 static int pid_to_ns(int sock, pid_t tpid)
2389 {
2390 char v = '0';
2391 struct ucred cred;
2392
2393 while (recv_creds(sock, &cred, &v)) {
2394 if (v == '1')
2395 return 0;
2396 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2397 return 1;
2398 }
2399 return 0;
2400 }
2401
2402
2403 /*
2404 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2405 * in your old pidns. Only children which you clone will be in the target
2406 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2407 * actually convert pids.
2408 *
2409 * Note: glibc's fork() does not respect pidns, which can lead to failed
2410 * assertions inside glibc (and thus failed forks) if the child's pid in
2411 * the pidns and the parent pid outside are identical. Using clone prevents
2412 * this issue.
2413 */
2414 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2415 {
2416 int newnsfd = -1, ret, cpipe[2];
2417 char fnam[100];
2418 pid_t cpid;
2419 char v;
2420
2421 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2422 if (ret < 0 || ret >= sizeof(fnam))
2423 _exit(1);
2424 newnsfd = open(fnam, O_RDONLY);
2425 if (newnsfd < 0)
2426 _exit(1);
2427 if (setns(newnsfd, 0) < 0)
2428 _exit(1);
2429 close(newnsfd);
2430
2431 if (pipe(cpipe) < 0)
2432 _exit(1);
2433
2434 struct pid_ns_clone_args args = {
2435 .cpipe = cpipe,
2436 .sock = sock,
2437 .tpid = tpid,
2438 .wrapped = &pid_to_ns
2439 };
2440 size_t stack_size = sysconf(_SC_PAGESIZE);
2441 void *stack = alloca(stack_size);
2442
2443 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2444 if (cpid < 0)
2445 _exit(1);
2446
2447 // give the child 1 second to be done forking and
2448 // write its ack
2449 if (!wait_for_sock(cpipe[0], 1))
2450 _exit(1);
2451 ret = read(cpipe[0], &v, 1);
2452 if (ret != sizeof(char) || v != '1')
2453 _exit(1);
2454
2455 if (!wait_for_pid(cpid))
2456 _exit(1);
2457 _exit(0);
2458 }
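
/*
 * Example of the effect described above (hypothetical pids): if lxcfs runs
 * as host pid 400 and setns()s into the pid namespace of a container whose
 * init is host pid 1234, getpid() in lxcfs still returns 400. Only the
 * child created by the clone() above gets a pid inside the container, so
 * it is that child which sees SCM_CREDENTIALS pids translated into the
 * container's pid namespace.
 */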
2459
2460 /*
2461 * To read cgroup files as seen from a particular pid, we fork a child which
2462 * setns()s into that pid's pidns and clones a helper there; pids are then
2463 * exchanged over a socketpair using SCM_CREDENTIALS so the kernel translates them.
2464 */
2465 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2466 {
2467 int sock[2] = {-1, -1};
2468 char *tmpdata = NULL;
2469 int ret;
2470 pid_t qpid, cpid = -1;
2471 bool answer = false;
2472 char v = '0';
2473 struct ucred cred;
2474 size_t sz = 0, asz = 0;
2475
2476 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2477 return false;
2478
2479 /*
2480 * Now we read the pids from returned data one by one, pass
2481 * them into a child in the target namespace, read back the
2482 * translated pids, and put them into our to-return data
2483 */
2484
2485 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2486 perror("socketpair");
2487 free(tmpdata);
2488 return false;
2489 }
2490
2491 cpid = fork();
2492 if (cpid == -1)
2493 goto out;
2494
2495 if (!cpid) // child - exits when done
2496 pid_to_ns_wrapper(sock[1], tpid);
2497
2498 char *ptr = tmpdata;
2499 cred.uid = 0;
2500 cred.gid = 0;
2501 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2502 cred.pid = qpid;
2503 ret = send_creds(sock[0], &cred, v, true);
2504
2505 if (ret == SEND_CREDS_NOTSK)
2506 goto next;
2507 if (ret == SEND_CREDS_FAIL)
2508 goto out;
2509
2510 // read converted results
2511 if (!wait_for_sock(sock[0], 2)) {
2512 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2513 goto out;
2514 }
2515 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2516 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2517 goto out;
2518 }
2519 must_strcat_pid(d, &sz, &asz, qpid);
2520 next:
2521 ptr = strchr(ptr, '\n');
2522 if (!ptr)
2523 break;
2524 ptr++;
2525 }
2526
2527 cred.pid = getpid();
2528 v = '1';
2529 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2530 // failed to ask child to exit
2531 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2532 goto out;
2533 }
2534
2535 answer = true;
2536
2537 out:
2538 free(tmpdata);
2539 if (cpid != -1)
2540 wait_for_pid(cpid);
2541 if (sock[0] != -1) {
2542 close(sock[0]);
2543 close(sock[1]);
2544 }
2545 return answer;
2546 }
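
/*
 * Worked example for the translation above (hypothetical pids): the host
 * view of a container's "tasks" file might read "1234\n1235\n". Each pid
 * is wrapped in a ucred and sent through the socketpair; the helper living
 * in the container's pid namespace sees the kernel-translated values and
 * echoes them back, so the buffer returned to the reader becomes e.g.
 * "56\n57\n".
 */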
2547
2548 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2549 struct fuse_file_info *fi)
2550 {
2551 struct fuse_context *fc = fuse_get_context();
2552 struct file_info *f = (struct file_info *)fi->fh;
2553 struct cgfs_files *k = NULL;
2554 char *data = NULL;
2555 int ret, s;
2556 bool r;
2557
2558 if (f->type != LXC_TYPE_CGFILE) {
2559 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2560 return -EIO;
2561 }
2562
2563 if (offset)
2564 return 0;
2565
2566 if (!fc)
2567 return -EIO;
2568
2569 if (!f->controller)
2570 return -EINVAL;
2571
2572 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2573 return -EINVAL;
2574 }
2575 free_key(k);
2576
2577
2578 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2579 ret = -EACCES;
2580 goto out;
2581 }
2582
2583 if (strcmp(f->file, "tasks") == 0 ||
2584 strcmp(f->file, "/tasks") == 0 ||
2585 strcmp(f->file, "/cgroup.procs") == 0 ||
2586 strcmp(f->file, "cgroup.procs") == 0)
2587 // special case - we have to translate the pids
2588 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2589 else
2590 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2591
2592 if (!r) {
2593 ret = -EINVAL;
2594 goto out;
2595 }
2596
2597 if (!data) {
2598 ret = 0;
2599 goto out;
2600 }
2601 s = strlen(data);
2602 if (s > size)
2603 s = size;
2604 memcpy(buf, data, s);
2605 if (s > 0 && s < size && data[s-1] != '\n')
2606 buf[s++] = '\n';
2607
2608 ret = s;
2609
2610 out:
2611 free(data);
2612 return ret;
2613 }
2614
2615 static int pid_from_ns(int sock, pid_t tpid)
2616 {
2617 pid_t vpid;
2618 struct ucred cred;
2619 char v;
2620 int ret;
2621
2622 cred.uid = 0;
2623 cred.gid = 0;
2624 while (1) {
2625 if (!wait_for_sock(sock, 2)) {
2626 lxcfs_error("%s\n", "Timeout reading from parent.");
2627 return 1;
2628 }
2629 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2630 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2631 return 1;
2632 }
2633 if (vpid == -1) // done
2634 break;
2635 v = '0';
2636 cred.pid = vpid;
2637 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2638 v = '1';
2639 cred.pid = getpid();
2640 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2641 return 1;
2642 }
2643 }
2644 return 0;
2645 }
2646
2647 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2648 {
2649 int newnsfd = -1, ret, cpipe[2];
2650 char fnam[100];
2651 pid_t cpid;
2652 char v;
2653
2654 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2655 if (ret < 0 || ret >= sizeof(fnam))
2656 _exit(1);
2657 newnsfd = open(fnam, O_RDONLY);
2658 if (newnsfd < 0)
2659 _exit(1);
2660 if (setns(newnsfd, 0) < 0)
2661 _exit(1);
2662 close(newnsfd);
2663
2664 if (pipe(cpipe) < 0)
2665 _exit(1);
2666
2667 struct pid_ns_clone_args args = {
2668 .cpipe = cpipe,
2669 .sock = sock,
2670 .tpid = tpid,
2671 .wrapped = &pid_from_ns
2672 };
2673 size_t stack_size = sysconf(_SC_PAGESIZE);
2674 void *stack = alloca(stack_size);
2675
2676 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2677 if (cpid < 0)
2678 _exit(1);
2679
2680 // give the child 1 second to be done forking and
2681 // write its ack
2682 if (!wait_for_sock(cpipe[0], 1))
2683 _exit(1);
2684 ret = read(cpipe[0], &v, 1);
2685 if (ret != sizeof(char) || v != '1')
2686 _exit(1);
2687
2688 if (!wait_for_pid(cpid))
2689 _exit(1);
2690 _exit(0);
2691 }
2692
2693 /*
2694 * Given host @uid, store the uid to which it maps in @pid's user
2695 * namespace in *answer. Returns false if there is no mapping.
2696 */
2697 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2698 {
2699 FILE *f;
2700 char line[400];
2701
2702 sprintf(line, "/proc/%d/uid_map", pid);
2703 if ((f = fopen(line, "r")) == NULL) {
2704 return false;
2705 }
2706
2707 *answer = convert_id_to_ns(f, uid);
2708 fclose(f);
2709
2710 if (*answer == -1)
2711 return false;
2712 return true;
2713 }
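
/*
 * Example, assuming the usual uid_map semantics (hypothetical mapping): if
 * /proc/<pid>/uid_map contains
 *
 *     0 100000 65536
 *
 * then host uid 100000 maps to uid 0 in @pid's user namespace and host uid
 * 100031 maps to 31, while host uids outside [100000, 165535] have no
 * mapping and make hostuid_to_ns() return false.
 */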
2714
2715 /*
2716 * get_pid_creds: get the real uid and gid of @pid from
2717 * /proc/$$/status
2718 * (XXX should we use euid here?)
2719 */
2720 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2721 {
2722 char line[400];
2723 uid_t u;
2724 gid_t g;
2725 FILE *f;
2726
2727 *uid = -1;
2728 *gid = -1;
2729 sprintf(line, "/proc/%d/status", pid);
2730 if ((f = fopen(line, "r")) == NULL) {
2731 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2732 return;
2733 }
2734 while (fgets(line, 400, f)) {
2735 if (strncmp(line, "Uid:", 4) == 0) {
2736 if (sscanf(line+4, "%u", &u) != 1) {
2737 lxcfs_error("bad uid line for pid %u\n", pid);
2738 fclose(f);
2739 return;
2740 }
2741 *uid = u;
2742 } else if (strncmp(line, "Gid:", 4) == 0) {
2743 if (sscanf(line+4, "%u", &g) != 1) {
2744 lxcfs_error("bad gid line for pid %u\n", pid);
2745 fclose(f);
2746 return;
2747 }
2748 *gid = g;
2749 }
2750 }
2751 fclose(f);
2752 }
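
/*
 * The parsed /proc/<pid>/status lines look like this (example values):
 *
 *     Uid:    1000    1000    1000    1000
 *     Gid:    1000    1000    1000    1000
 *
 * Only the first column (the real uid/gid) is read above; the effective,
 * saved and filesystem ids are ignored.
 */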
2753
2754 /*
2755 * May the requestor @r move victim @v to a new cgroup?
2756 * This is allowed if
2757 * . they are the same task
2758 * . they are owned by the same uid
2759 * . @r is root on the host, or
2760 * . @r is root in its own user namespace and @v's uid is mapped into it.
2761 */
2762 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2763 {
2764 uid_t v_uid, tmpuid;
2765 gid_t v_gid;
2766
2767 if (r == v)
2768 return true;
2769 if (r_uid == 0)
2770 return true;
2771 get_pid_creds(v, &v_uid, &v_gid);
2772 if (r_uid == v_uid)
2773 return true;
2774 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2775 && hostuid_to_ns(v_uid, r, &tmpuid))
2776 return true;
2777 return false;
2778 }
2779
2780 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2781 const char *file, const char *buf)
2782 {
2783 int sock[2] = {-1, -1};
2784 pid_t qpid, cpid = -1;
2785 FILE *pids_file = NULL;
2786 bool answer = false, fail = false;
2787
2788 pids_file = open_pids_file(contrl, cg);
2789 if (!pids_file)
2790 return false;
2791
2792 /*
2793 * write the pids to a socket, have helper in writer's pidns
2794 * call movepid for us
2795 */
2796 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2797 perror("socketpair");
2798 goto out;
2799 }
2800
2801 cpid = fork();
2802 if (cpid == -1)
2803 goto out;
2804
2805 if (!cpid) { // child
2806 fclose(pids_file);
2807 pid_from_ns_wrapper(sock[1], tpid);
2808 }
2809
2810 const char *ptr = buf;
2811 while (sscanf(ptr, "%d", &qpid) == 1) {
2812 struct ucred cred;
2813 char v;
2814
2815 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2816 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2817 goto out;
2818 }
2819
2820 if (recv_creds(sock[0], &cred, &v)) {
2821 if (v == '0') {
2822 if (!may_move_pid(tpid, tuid, cred.pid)) {
2823 fail = true;
2824 break;
2825 }
2826 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2827 fail = true;
2828 }
2829 }
2830
2831 ptr = strchr(ptr, '\n');
2832 if (!ptr)
2833 break;
2834 ptr++;
2835 }
2836
2837 /* All good, write the value */
2838 qpid = -1;
2839 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2840 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2841
2842 if (!fail)
2843 answer = true;
2844
2845 out:
2846 if (cpid != -1)
2847 wait_for_pid(cpid);
2848 if (sock[0] != -1) {
2849 close(sock[0]);
2850 close(sock[1]);
2851 }
2852 if (pids_file) {
2853 if (fclose(pids_file) != 0)
2854 answer = false;
2855 }
2856 return answer;
2857 }
2858
2859 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2860 struct fuse_file_info *fi)
2861 {
2862 struct fuse_context *fc = fuse_get_context();
2863 char *localbuf = NULL;
2864 struct cgfs_files *k = NULL;
2865 struct file_info *f = (struct file_info *)fi->fh;
2866 bool r;
2867
2868 if (f->type != LXC_TYPE_CGFILE) {
2869 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2870 return -EIO;
2871 }
2872
2873 if (offset)
2874 return 0;
2875
2876 if (!fc)
2877 return -EIO;
2878
2879 localbuf = alloca(size+1);
2880 localbuf[size] = '\0';
2881 memcpy(localbuf, buf, size);
2882
2883 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2884 size = -EINVAL;
2885 goto out;
2886 }
2887
2888 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2889 size = -EACCES;
2890 goto out;
2891 }
2892
2893 if (strcmp(f->file, "tasks") == 0 ||
2894 strcmp(f->file, "/tasks") == 0 ||
2895 strcmp(f->file, "/cgroup.procs") == 0 ||
2896 strcmp(f->file, "cgroup.procs") == 0)
2897 // special case - we have to translate the pids
2898 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2899 else
2900 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2901
2902 if (!r)
2903 size = -EINVAL;
2904
2905 out:
2906 free_key(k);
2907 return size;
2908 }
2909
2910 int cg_chown(const char *path, uid_t uid, gid_t gid)
2911 {
2912 struct fuse_context *fc = fuse_get_context();
2913 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2914 struct cgfs_files *k = NULL;
2915 const char *cgroup;
2916 int ret;
2917
2918 if (!fc)
2919 return -EIO;
2920
2921 if (strcmp(path, "/cgroup") == 0)
2922 return -EPERM;
2923
2924 controller = pick_controller_from_path(fc, path);
2925 if (!controller)
2926 return errno == ENOENT ? -EPERM : -errno;
2927
2928 cgroup = find_cgroup_in_path(path);
2929 if (!cgroup)
2930 /* this is just /cgroup/controller */
2931 return -EPERM;
2932
2933 get_cgdir_and_path(cgroup, &cgdir, &last);
2934
2935 if (!last) {
2936 path1 = "/";
2937 path2 = cgdir;
2938 } else {
2939 path1 = cgdir;
2940 path2 = last;
2941 }
2942
2943 if (is_child_cgroup(controller, path1, path2)) {
2944 // get uid and gid from the 'tasks' file and make up a mode
2945 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2946 k = cgfs_get_key(controller, cgroup, "tasks");
2947
2948 } else
2949 k = cgfs_get_key(controller, path1, path2);
2950
2951 if (!k) {
2952 ret = -EINVAL;
2953 goto out;
2954 }
2955
2956 /*
2957 * This being a fuse request, the uid and gid must be valid
2958 * in the caller's namespace. So we can just check to make
2959 * sure that the caller is root in his uid, and privileged
2960 * over the file's current owner.
2961 */
2962 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2963 ret = -EACCES;
2964 goto out;
2965 }
2966
2967 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2968
2969 out:
2970 free_key(k);
2971 free(cgdir);
2972
2973 return ret;
2974 }
2975
2976 int cg_chmod(const char *path, mode_t mode)
2977 {
2978 struct fuse_context *fc = fuse_get_context();
2979 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2980 struct cgfs_files *k = NULL;
2981 const char *cgroup;
2982 int ret;
2983
2984 if (!fc)
2985 return -EIO;
2986
2987 if (strcmp(path, "/cgroup") == 0)
2988 return -EPERM;
2989
2990 controller = pick_controller_from_path(fc, path);
2991 if (!controller)
2992 return errno == ENOENT ? -EPERM : -errno;
2993
2994 cgroup = find_cgroup_in_path(path);
2995 if (!cgroup)
2996 /* this is just /cgroup/controller */
2997 return -EPERM;
2998
2999 get_cgdir_and_path(cgroup, &cgdir, &last);
3000
3001 if (!last) {
3002 path1 = "/";
3003 path2 = cgdir;
3004 } else {
3005 path1 = cgdir;
3006 path2 = last;
3007 }
3008
3009 if (is_child_cgroup(controller, path1, path2)) {
3010 // get uid and gid from the 'tasks' file and make up a mode
3011 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3012 k = cgfs_get_key(controller, cgroup, "tasks");
3013
3014 } else
3015 k = cgfs_get_key(controller, path1, path2);
3016
3017 if (!k) {
3018 ret = -EINVAL;
3019 goto out;
3020 }
3021
3022 /*
3023 * This being a fuse request, the uid and gid must be valid
3024 * in the caller's namespace. So we can just check to make
3025 * sure that the caller is root in his uid, and privileged
3026 * over the file's current owner.
3027 */
3028 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3029 ret = -EPERM;
3030 goto out;
3031 }
3032
3033 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3034 ret = -EINVAL;
3035 goto out;
3036 }
3037
3038 ret = 0;
3039 out:
3040 free_key(k);
3041 free(cgdir);
3042 return ret;
3043 }
3044
3045 int cg_mkdir(const char *path, mode_t mode)
3046 {
3047 struct fuse_context *fc = fuse_get_context();
3048 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3049 const char *cgroup;
3050 int ret;
3051
3052 if (!fc)
3053 return -EIO;
3054
3055 controller = pick_controller_from_path(fc, path);
3056 if (!controller)
3057 return errno == ENOENT ? -EPERM : -errno;
3058
3059 cgroup = find_cgroup_in_path(path);
3060 if (!cgroup)
3061 return -errno;
3062
3063 get_cgdir_and_path(cgroup, &cgdir, &last);
3064 if (!last)
3065 path1 = "/";
3066 else
3067 path1 = cgdir;
3068
3069 pid_t initpid = lookup_initpid_in_store(fc->pid);
3070 if (initpid <= 0)
3071 initpid = fc->pid;
3072 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3073 if (!next)
3074 ret = -EINVAL;
3075 else if (last && strcmp(next, last) == 0)
3076 ret = -EEXIST;
3077 else
3078 ret = -EPERM;
3079 goto out;
3080 }
3081
3082 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3083 ret = -EACCES;
3084 goto out;
3085 }
3086 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3087 ret = -EACCES;
3088 goto out;
3089 }
3090
3091 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3092
3093 out:
3094 free(cgdir);
3095 free(next);
3096 return ret;
3097 }
3098
3099 int cg_rmdir(const char *path)
3100 {
3101 struct fuse_context *fc = fuse_get_context();
3102 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3103 const char *cgroup;
3104 int ret;
3105
3106 if (!fc)
3107 return -EIO;
3108
3109 controller = pick_controller_from_path(fc, path);
3110 if (!controller) /* Someone's trying to delete "/cgroup". */
3111 return -EPERM;
3112
3113 cgroup = find_cgroup_in_path(path);
3114 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3115 return -EPERM;
3116
3117 get_cgdir_and_path(cgroup, &cgdir, &last);
3118 if (!last) {
3119 /* Someone's trying to delete a cgroup on the same level as the
3120 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3121 * rmdir "/cgroup/blkio/init.slice".
3122 */
3123 ret = -EPERM;
3124 goto out;
3125 }
3126
3127 pid_t initpid = lookup_initpid_in_store(fc->pid);
3128 if (initpid <= 0)
3129 initpid = fc->pid;
3130 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3131 if (!last || (next && (strcmp(next, last) == 0)))
3132 ret = -EBUSY;
3133 else
3134 ret = -ENOENT;
3135 goto out;
3136 }
3137
3138 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3139 ret = -EACCES;
3140 goto out;
3141 }
3142 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3143 ret = -EACCES;
3144 goto out;
3145 }
3146
3147 if (!cgfs_remove(controller, cgroup)) {
3148 ret = -EINVAL;
3149 goto out;
3150 }
3151
3152 ret = 0;
3153
3154 out:
3155 free(cgdir);
3156 free(next);
3157 return ret;
3158 }
3159
3160 static bool startswith(const char *line, const char *pref)
3161 {
3162 if (strncmp(line, pref, strlen(pref)) == 0)
3163 return true;
3164 return false;
3165 }
3166
3167 static void parse_memstat(char *memstat, unsigned long *cached,
3168 unsigned long *active_anon, unsigned long *inactive_anon,
3169 unsigned long *active_file, unsigned long *inactive_file,
3170 unsigned long *unevictable)
3171 {
3172 char *eol;
3173
3174 while (*memstat) {
3175 if (startswith(memstat, "total_cache")) {
3176 sscanf(memstat + 11, "%lu", cached);
3177 *cached /= 1024;
3178 } else if (startswith(memstat, "total_active_anon")) {
3179 sscanf(memstat + 17, "%lu", active_anon);
3180 *active_anon /= 1024;
3181 } else if (startswith(memstat, "total_inactive_anon")) {
3182 sscanf(memstat + 19, "%lu", inactive_anon);
3183 *inactive_anon /= 1024;
3184 } else if (startswith(memstat, "total_active_file")) {
3185 sscanf(memstat + 17, "%lu", active_file);
3186 *active_file /= 1024;
3187 } else if (startswith(memstat, "total_inactive_file")) {
3188 sscanf(memstat + 19, "%lu", inactive_file);
3189 *inactive_file /= 1024;
3190 } else if (startswith(memstat, "total_unevictable")) {
3191 sscanf(memstat + 17, "%lu", unevictable);
3192 *unevictable /= 1024;
3193 }
3194 eol = strchr(memstat, '\n');
3195 if (!eol)
3196 return;
3197 memstat = eol+1;
3198 }
3199 }
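
/*
 * Minimal sketch of feeding parse_memstat() a memory.stat excerpt. The
 * numbers are made up; the kernel reports the values in bytes and they are
 * converted to kB above. Kept out of the build with #if 0.
 */
#if 0
static void parse_memstat_example(void)
{
	char sample[] = "total_cache 2048000\n"
			"total_active_anon 1024000\n"
			"total_inactive_anon 0\n"
			"total_active_file 512000\n"
			"total_inactive_file 256000\n"
			"total_unevictable 0\n";
	unsigned long cached = 0, active_anon = 0, inactive_anon = 0,
		      active_file = 0, inactive_file = 0, unevictable = 0;

	parse_memstat(sample, &cached, &active_anon, &inactive_anon,
		      &active_file, &inactive_file, &unevictable);
	/* cached is now 2048000 / 1024 = 2000 (kB). */
}
#endif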
3200
3201 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3202 {
3203 char *eol;
3204 char key[32];
3205
3206 memset(key, 0, 32);
3207 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3208
3209 size_t len = strlen(key);
3210 *v = 0;
3211
3212 while (*str) {
3213 if (startswith(str, key)) {
3214 sscanf(str + len, "%lu", v);
3215 return;
3216 }
3217 eol = strchr(str, '\n');
3218 if (!eol)
3219 return;
3220 str = eol+1;
3221 }
3222 }
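
/*
 * Sketch of the line format handled by get_blkio_io_value(). The
 * blkio.*_recursive files contain "major:minor Op value" entries such as
 * (made-up numbers):
 *
 *     8:0 Read 4096
 *     8:0 Write 81920
 *     8:0 Total 86016
 *
 * Looking up major=8, minor=0, iotype="Read" in that text stores 4096 in
 * the output value. Kept out of the build with #if 0.
 */
#if 0
static void get_blkio_io_value_example(void)
{
	char sample[] = "8:0 Read 4096\n8:0 Write 81920\n8:0 Total 86016\n";
	unsigned long v = 0;

	get_blkio_io_value(sample, 8, 0, "Read", &v);
	/* v is now 4096. */
}
#endif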
3223
3224 static int read_file(const char *path, char *buf, size_t size,
3225 struct file_info *d)
3226 {
3227 size_t linelen = 0, total_len = 0, rv = 0;
3228 char *line = NULL;
3229 char *cache = d->buf;
3230 size_t cache_size = d->buflen;
3231 FILE *f = fopen(path, "r");
3232 if (!f)
3233 return 0;
3234
3235 while (getline(&line, &linelen, f) != -1) {
3236 ssize_t l = snprintf(cache, cache_size, "%s", line);
3237 if (l < 0) {
3238 perror("Error writing to cache");
3239 rv = 0;
3240 goto err;
3241 }
3242 if (l >= cache_size) {
3243 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3244 rv = 0;
3245 goto err;
3246 }
3247 cache += l;
3248 cache_size -= l;
3249 total_len += l;
3250 }
3251
3252 d->size = total_len;
3253 if (total_len > size)
3254 total_len = size;
3255
3256 /* read from off 0 */
3257 memcpy(buf, d->buf, total_len);
3258 rv = total_len;
3259 err:
3260 fclose(f);
3261 free(line);
3262 return rv;
3263 }
3264
3265 /*
3266 * FUSE ops for /proc
3267 */
3268
3269 static unsigned long get_memlimit(const char *cgroup, const char *file)
3270 {
3271 char *memlimit_str = NULL;
3272 unsigned long memlimit = -1;
3273
3274 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3275 memlimit = strtoul(memlimit_str, NULL, 10);
3276
3277 free(memlimit_str);
3278
3279 return memlimit;
3280 }
3281
3282 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3283 {
3284 char *copy = strdupa(cgroup);
3285 unsigned long memlimit = 0, retlimit;
3286
3287 retlimit = get_memlimit(copy, file);
3288
3289 while (strcmp(copy, "/") != 0) {
3290 copy = dirname(copy);
3291 memlimit = get_memlimit(copy, file);
3292 if (memlimit != -1 && memlimit < retlimit)
3293 retlimit = memlimit;
3294 };
3295
3296 return retlimit;
3297 }
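
/*
 * Worked example (hypothetical limits): for cg "/lxc/c1" with
 * memory.limit_in_bytes of 1073741824 on "/lxc/c1", 2147483648 on "/lxc"
 * and an effectively unlimited value on "/", the loop above walks
 * c1 -> lxc -> / via dirname() and returns 1073741824, i.e. the tightest
 * limit anywhere along the path.
 */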
3298
3299 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3300 struct fuse_file_info *fi)
3301 {
3302 struct fuse_context *fc = fuse_get_context();
3303 struct file_info *d = (struct file_info *)fi->fh;
3304 char *cg;
3305 char *memusage_str = NULL, *memstat_str = NULL,
3306 *memswlimit_str = NULL, *memswusage_str = NULL;
3307 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3308 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3309 active_file = 0, inactive_file = 0, unevictable = 0,
3310 hostswtotal = 0;
3311 char *line = NULL;
3312 size_t linelen = 0, total_len = 0, rv = 0;
3313 char *cache = d->buf;
3314 size_t cache_size = d->buflen;
3315 FILE *f = NULL;
3316
3317 if (offset){
3318 if (offset > d->size)
3319 return -EINVAL;
3320 if (!d->cached)
3321 return 0;
3322 int left = d->size - offset;
3323 total_len = left > size ? size: left;
3324 memcpy(buf, cache + offset, total_len);
3325 return total_len;
3326 }
3327
3328 pid_t initpid = lookup_initpid_in_store(fc->pid);
3329 if (initpid <= 0)
3330 initpid = fc->pid;
3331 cg = get_pid_cgroup(initpid, "memory");
3332 if (!cg)
3333 return read_file("/proc/meminfo", buf, size, d);
3334 prune_init_slice(cg);
3335
3336 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3337 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3338 goto err;
3339 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3340 goto err;
3341
3342 // The following values are allowed to fail, because swapaccount might be
3343 // turned off for the current kernel
3344 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3345 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3346 {
3347 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3348 memswusage = strtoul(memswusage_str, NULL, 10);
3349
3350 memswlimit = memswlimit / 1024;
3351 memswusage = memswusage / 1024;
3352 }
3353
3354 memusage = strtoul(memusage_str, NULL, 10);
3355 memlimit /= 1024;
3356 memusage /= 1024;
3357
3358 parse_memstat(memstat_str, &cached, &active_anon,
3359 &inactive_anon, &active_file, &inactive_file,
3360 &unevictable);
3361
3362 f = fopen("/proc/meminfo", "r");
3363 if (!f)
3364 goto err;
3365
3366 while (getline(&line, &linelen, f) != -1) {
3367 ssize_t l;
3368 char *printme, lbuf[100];
3369
3370 memset(lbuf, 0, 100);
3371 if (startswith(line, "MemTotal:")) {
3372 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3373 if (hosttotal < memlimit)
3374 memlimit = hosttotal;
3375 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3376 printme = lbuf;
3377 } else if (startswith(line, "MemFree:")) {
3378 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3379 printme = lbuf;
3380 } else if (startswith(line, "MemAvailable:")) {
3381 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3382 printme = lbuf;
3383 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3384 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3385 if (hostswtotal < memswlimit)
3386 memswlimit = hostswtotal;
3387 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3388 printme = lbuf;
3389 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3390 unsigned long swaptotal = memswlimit,
3391 swapusage = memswusage - memusage,
3392 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3393 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3394 printme = lbuf;
3395 } else if (startswith(line, "Slab:")) {
3396 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3397 printme = lbuf;
3398 } else if (startswith(line, "Buffers:")) {
3399 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3400 printme = lbuf;
3401 } else if (startswith(line, "Cached:")) {
3402 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3403 printme = lbuf;
3404 } else if (startswith(line, "SwapCached:")) {
3405 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3406 printme = lbuf;
3407 } else if (startswith(line, "Active:")) {
3408 snprintf(lbuf, 100, "Active: %8lu kB\n",
3409 active_anon + active_file);
3410 printme = lbuf;
3411 } else if (startswith(line, "Inactive:")) {
3412 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3413 inactive_anon + inactive_file);
3414 printme = lbuf;
3415 } else if (startswith(line, "Active(anon)")) {
3416 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3417 printme = lbuf;
3418 } else if (startswith(line, "Inactive(anon)")) {
3419 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3420 printme = lbuf;
3421 } else if (startswith(line, "Active(file)")) {
3422 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3423 printme = lbuf;
3424 } else if (startswith(line, "Inactive(file)")) {
3425 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3426 printme = lbuf;
3427 } else if (startswith(line, "Unevictable")) {
3428 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3429 printme = lbuf;
3430 } else if (startswith(line, "SReclaimable")) {
3431 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3432 printme = lbuf;
3433 } else if (startswith(line, "SUnreclaim")) {
3434 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3435 printme = lbuf;
3436 } else
3437 printme = line;
3438
3439 l = snprintf(cache, cache_size, "%s", printme);
3440 if (l < 0) {
3441 perror("Error writing to cache");
3442 rv = 0;
3443 goto err;
3444
3445 }
3446 if (l >= cache_size) {
3447 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3448 rv = 0;
3449 goto err;
3450 }
3451
3452 cache += l;
3453 cache_size -= l;
3454 total_len += l;
3455 }
3456
3457 d->cached = 1;
3458 d->size = total_len;
3459 if (total_len > size ) total_len = size;
3460 memcpy(buf, d->buf, total_len);
3461
3462 rv = total_len;
3463 err:
3464 if (f)
3465 fclose(f);
3466 free(line);
3467 free(cg);
3468 free(memusage_str);
3469 free(memswlimit_str);
3470 free(memswusage_str);
3471 free(memstat_str);
3472 return rv;
3473 }
3474
3475 /*
3476 * Read the cpuset.cpus for cg
3477 * Return the answer in a newly allocated string which must be freed
3478 */
3479 static char *get_cpuset(const char *cg)
3480 {
3481 char *answer;
3482
3483 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3484 return NULL;
3485 return answer;
3486 }
3487
3488 bool cpu_in_cpuset(int cpu, const char *cpuset);
3489
3490 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3491 {
3492 int cpu;
3493
3494 if (sscanf(line, "processor : %d", &cpu) != 1)
3495 return false;
3496 return cpu_in_cpuset(cpu, cpuset);
3497 }
3498
3499 /*
3500 * check whether this is a '^processor' line in /proc/cpuinfo
3501 */
3502 static bool is_processor_line(const char *line)
3503 {
3504 int cpu;
3505
3506 if (sscanf(line, "processor : %d", &cpu) == 1)
3507 return true;
3508 return false;
3509 }
3510
3511 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3512 struct fuse_file_info *fi)
3513 {
3514 struct fuse_context *fc = fuse_get_context();
3515 struct file_info *d = (struct file_info *)fi->fh;
3516 char *cg;
3517 char *cpuset = NULL;
3518 char *line = NULL;
3519 size_t linelen = 0, total_len = 0, rv = 0;
3520 bool am_printing = false, firstline = true, is_s390x = false;
3521 int curcpu = -1, cpu;
3522 char *cache = d->buf;
3523 size_t cache_size = d->buflen;
3524 FILE *f = NULL;
3525
3526 if (offset){
3527 if (offset > d->size)
3528 return -EINVAL;
3529 if (!d->cached)
3530 return 0;
3531 int left = d->size - offset;
3532 total_len = left > size ? size: left;
3533 memcpy(buf, cache + offset, total_len);
3534 return total_len;
3535 }
3536
3537 pid_t initpid = lookup_initpid_in_store(fc->pid);
3538 if (initpid <= 0)
3539 initpid = fc->pid;
3540 cg = get_pid_cgroup(initpid, "cpuset");
3541 if (!cg)
3542 return read_file("proc/cpuinfo", buf, size, d);
3543 prune_init_slice(cg);
3544
3545 cpuset = get_cpuset(cg);
3546 if (!cpuset)
3547 goto err;
3548
3549 f = fopen("/proc/cpuinfo", "r");
3550 if (!f)
3551 goto err;
3552
3553 while (getline(&line, &linelen, f) != -1) {
3554 ssize_t l;
3555 if (firstline) {
3556 firstline = false;
3557 if (strstr(line, "IBM/S390") != NULL) {
3558 is_s390x = true;
3559 am_printing = true;
3560 continue;
3561 }
3562 }
3563 if (strncmp(line, "# processors:", 12) == 0)
3564 continue;
3565 if (is_processor_line(line)) {
3566 am_printing = cpuline_in_cpuset(line, cpuset);
3567 if (am_printing) {
3568 curcpu ++;
3569 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3570 if (l < 0) {
3571 perror("Error writing to cache");
3572 rv = 0;
3573 goto err;
3574 }
3575 if (l >= cache_size) {
3576 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3577 rv = 0;
3578 goto err;
3579 }
3580 cache += l;
3581 cache_size -= l;
3582 total_len += l;
3583 }
3584 continue;
3585 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3586 char *p;
3587 if (!cpu_in_cpuset(cpu, cpuset))
3588 continue;
3589 curcpu ++;
3590 p = strchr(line, ':');
3591 if (!p || !*p)
3592 goto err;
3593 p++;
3594 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3595 if (l < 0) {
3596 perror("Error writing to cache");
3597 rv = 0;
3598 goto err;
3599 }
3600 if (l >= cache_size) {
3601 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3602 rv = 0;
3603 goto err;
3604 }
3605 cache += l;
3606 cache_size -= l;
3607 total_len += l;
3608 continue;
3609
3610 }
3611 if (am_printing) {
3612 l = snprintf(cache, cache_size, "%s", line);
3613 if (l < 0) {
3614 perror("Error writing to cache");
3615 rv = 0;
3616 goto err;
3617 }
3618 if (l >= cache_size) {
3619 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3620 rv = 0;
3621 goto err;
3622 }
3623 cache += l;
3624 cache_size -= l;
3625 total_len += l;
3626 }
3627 }
3628
3629 if (is_s390x) {
3630 char *origcache = d->buf;
3631 ssize_t l;
3632 do {
3633 d->buf = malloc(d->buflen);
3634 } while (!d->buf);
3635 cache = d->buf;
3636 cache_size = d->buflen;
3637 total_len = 0;
3638 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3639 if (l < 0 || l >= cache_size) {
3640 free(origcache);
3641 goto err;
3642 }
3643 cache_size -= l;
3644 cache += l;
3645 total_len += l;
3646 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3647 if (l < 0 || l >= cache_size) {
3648 free(origcache);
3649 goto err;
3650 }
3651 cache_size -= l;
3652 cache += l;
3653 total_len += l;
3654 l = snprintf(cache, cache_size, "%s", origcache);
3655 free(origcache);
3656 if (l < 0 || l >= cache_size)
3657 goto err;
3658 total_len += l;
3659 }
3660
3661 d->cached = 1;
3662 d->size = total_len;
3663 if (total_len > size ) total_len = size;
3664
3665 /* read from off 0 */
3666 memcpy(buf, d->buf, total_len);
3667 rv = total_len;
3668 err:
3669 if (f)
3670 fclose(f);
3671 free(line);
3672 free(cpuset);
3673 free(cg);
3674 return rv;
3675 }
3676
3677 static uint64_t get_reaper_start_time(pid_t pid)
3678 {
3679 int ret;
3680 FILE *f;
3681 uint64_t starttime;
3682 /* strlen("/proc/") = 6
3683 * +
3684 * LXCFS_NUMSTRLEN64
3685 * +
3686 * strlen("/stat") = 5
3687 * +
3688 * \0 = 1
3689 * */
3690 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3691 char path[__PROC_PID_STAT_LEN];
3692 pid_t qpid;
3693
3694 qpid = lookup_initpid_in_store(pid);
3695 if (qpid <= 0) {
3696 /* Caller can check for EINVAL on 0. */
3697 errno = EINVAL;
3698 return 0;
3699 }
3700
3701 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3702 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3703 /* Caller can check for EINVAL on 0. */
3704 errno = EINVAL;
3705 return 0;
3706 }
3707
3708 f = fopen(path, "r");
3709 if (!f) {
3710 /* Caller can check for EINVAL on 0. */
3711 errno = EINVAL;
3712 return 0;
3713 }
3714
3715 /* Note that the *scanf() argument suppression requires that length
3716 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3717 * at us. It's like telling someone you're not married and then asking
3718 * if you can bring your wife to the party.
3719 */
3720 ret = fscanf(f, "%*d " /* (1) pid %d */
3721 "%*s " /* (2) comm %s */
3722 "%*c " /* (3) state %c */
3723 "%*d " /* (4) ppid %d */
3724 "%*d " /* (5) pgrp %d */
3725 "%*d " /* (6) session %d */
3726 "%*d " /* (7) tty_nr %d */
3727 "%*d " /* (8) tpgid %d */
3728 "%*u " /* (9) flags %u */
3729 "%*u " /* (10) minflt %lu */
3730 "%*u " /* (11) cminflt %lu */
3731 "%*u " /* (12) majflt %lu */
3732 "%*u " /* (13) cmajflt %lu */
3733 "%*u " /* (14) utime %lu */
3734 "%*u " /* (15) stime %lu */
3735 "%*d " /* (16) cutime %ld */
3736 "%*d " /* (17) cstime %ld */
3737 "%*d " /* (18) priority %ld */
3738 "%*d " /* (19) nice %ld */
3739 "%*d " /* (20) num_threads %ld */
3740 "%*d " /* (21) itrealvalue %ld */
3741 "%" PRIu64, /* (22) starttime %llu */
3742 &starttime);
3743 if (ret != 1) {
3744 fclose(f);
3745 /* Caller can check for EINVAL on 0. */
3746 errno = EINVAL;
3747 return 0;
3748 }
3749
3750 fclose(f);
3751
3752 errno = 0;
3753 return starttime;
3754 }
3755
3756 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3757 {
3758 uint64_t clockticks;
3759 int64_t ticks_per_sec;
3760
3761 clockticks = get_reaper_start_time(pid);
3762 if (clockticks == 0 && errno == EINVAL) {
3763 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3764 return 0;
3765 }
3766
3767 ticks_per_sec = sysconf(_SC_CLK_TCK);
3768 if (ticks_per_sec < 0 && errno == EINVAL) {
3769 lxcfs_debug(
3770 "%s\n",
3771 "failed to determine number of clock ticks in a second");
3772 return 0;
3773 }
3774
3775 return (clockticks /= ticks_per_sec);
3776 }
3777
3778 static uint64_t get_reaper_age(pid_t pid)
3779 {
3780 uint64_t procstart, uptime, procage;
3781
3782 /* We need to subtract the time at which the reaper started (measured
3783 * relative to system boot) from the current system uptime to get the
3784 * actual reaper age.
3785 */
3786 procstart = get_reaper_start_time_in_sec(pid);
3787 procage = procstart;
3788 if (procstart > 0) {
3789 int ret;
3790 struct timespec spec;
3791
3792 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3793 if (ret < 0)
3794 return 0;
3795 /* We could make this more precise by using the tv_nsec field of the
3796 * timespec struct, converting it to milliseconds, and building a double
3797 * from the seconds and milliseconds, but that seems like more work than
3798 * it is worth.
3799 */
3800 uptime = spec.tv_sec;
3801 procage = uptime - procstart;
3802 }
3803
3804 return procage;
3805 }
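
/*
 * Worked example (hypothetical numbers): with sysconf(_SC_CLK_TCK) == 100
 * and a reaper whose starttime field in /proc/<pid>/stat is 4200 ticks,
 * the reaper started 4200 / 100 = 42 seconds after boot. If CLOCK_BOOTTIME
 * currently reads 1042 seconds, get_reaper_age() returns 1042 - 42 = 1000.
 */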
3806
3807 /*
3808 * Returns 0 on success.
3809 * It is the caller's responsibility to free `return_usage`, unless this
3810 * function returns an error.
3811 */
3812 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3813 {
3814 int cpucount = get_nprocs();
3815 struct cpuacct_usage *cpu_usage;
3816 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3817 int cg_cpu;
3818 uint64_t cg_user, cg_system;
3819 int64_t ticks_per_sec;
3820 char *usage_str = NULL;
3821
3822 ticks_per_sec = sysconf(_SC_CLK_TCK);
3823
3824 if (ticks_per_sec < 0 && errno == EINVAL) {
3825 lxcfs_debug(
3826 "%s\n",
3827 "read_cpuacct_usage_all failed to determine number of clock ticks "
3828 "in a second");
3829 return -1;
3830 }
3831
3832 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3833 if (!cpu_usage)
3834 return -ENOMEM;
3835
3836 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3837 rv = -1;
3838 goto err;
3839 }
3840
3841 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3842 lxcfs_error("read_cpuacct_usage_all reading first line from "
3843 "%s/cpuacct.usage_all failed.\n", cg);
3844 rv = -1;
3845 goto err;
3846 }
3847
3848 read_pos += read_cnt;
3849
3850 for (i = 0, j = 0; i < cpucount; i++) {
3851 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3852 &cg_system, &read_cnt);
3853
3854 if (ret == EOF)
3855 break;
3856
3857 if (ret != 3) {
3858 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3859 "failed.\n", cg);
3860 rv = -1;
3861 goto err;
3862 }
3863
3864 read_pos += read_cnt;
3865
3866 if (!cpu_in_cpuset(i, cpuset))
3867 continue;
3868
3869 /* Convert the time from nanoseconds to USER_HZ */
3870 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3871 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3872 j++;
3873 }
3874
3875 rv = 0;
3876 *return_usage = cpu_usage;
3877
3878 err:
3879 if (usage_str)
3880 free(usage_str);
3881
3882 if (rv != 0) {
3883 free(cpu_usage);
3884 *return_usage = NULL;
3885 }
3886
3887 return rv;
3888 }
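
/*
 * cpuacct.usage_all consists of a header line followed by per-cpu user and
 * system times in nanoseconds, e.g. (made-up values):
 *
 *     cpu user system
 *     0 6000000000 2000000000
 *     1 3000000000 1000000000
 *
 * With sysconf(_SC_CLK_TCK) == 100, cpu 0 above ends up in the returned
 * array as user = 600 and system = 200 USER_HZ ticks.
 */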
3889
3890 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3891 static int proc_stat_read(char *buf, size_t size, off_t offset,
3892 struct fuse_file_info *fi)
3893 {
3894 struct fuse_context *fc = fuse_get_context();
3895 struct file_info *d = (struct file_info *)fi->fh;
3896 char *cg;
3897 char *cpuset = NULL;
3898 char *line = NULL;
3899 size_t linelen = 0, total_len = 0, rv = 0;
3900 int curcpu = -1; /* cpu numbering starts at 0 */
3901 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3902 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3903 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3904 char cpuall[CPUALL_MAX_SIZE];
3905 /* reserve for cpu all */
3906 char *cache = d->buf + CPUALL_MAX_SIZE;
3907 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3908 FILE *f = NULL;
3909 struct cpuacct_usage *cg_cpu_usage = NULL;
3910
3911 if (offset){
3912 if (offset > d->size)
3913 return -EINVAL;
3914 if (!d->cached)
3915 return 0;
3916 int left = d->size - offset;
3917 total_len = left > size ? size: left;
3918 memcpy(buf, d->buf + offset, total_len);
3919 return total_len;
3920 }
3921
3922 pid_t initpid = lookup_initpid_in_store(fc->pid);
3923 if (initpid <= 0)
3924 initpid = fc->pid;
3925 cg = get_pid_cgroup(initpid, "cpuset");
3926 if (!cg)
3927 return read_file("/proc/stat", buf, size, d);
3928 prune_init_slice(cg);
3929
3930 cpuset = get_cpuset(cg);
3931 if (!cpuset)
3932 goto err;
3933
3934 /*
3935 * Read cpuacct.usage_all for all CPUs.
3936 * If the cpuacct cgroup is present, it is used to calculate the container's
3937 * CPU usage. If not, values from the host's /proc/stat are used.
3938 */
3939 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
3940 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
3941 "falling back to the host's /proc/stat");
3942 }
3943
3944 f = fopen("/proc/stat", "r");
3945 if (!f)
3946 goto err;
3947
3948 //skip first line
3949 if (getline(&line, &linelen, f) < 0) {
3950 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3951 goto err;
3952 }
3953
3954 while (getline(&line, &linelen, f) != -1) {
3955 ssize_t l;
3956 int cpu;
3957 char cpu_char[10]; /* That's a lot of cores */
3958 char *c;
3959 uint64_t all_used, cg_used, new_idle;
3960 int ret;
3961
3962 if (strlen(line) == 0)
3963 continue;
3964 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3965 /* not a ^cpuN line containing a number N, just print it */
3966 l = snprintf(cache, cache_size, "%s", line);
3967 if (l < 0) {
3968 perror("Error writing to cache");
3969 rv = 0;
3970 goto err;
3971 }
3972 if (l >= cache_size) {
3973 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3974 rv = 0;
3975 goto err;
3976 }
3977 cache += l;
3978 cache_size -= l;
3979 total_len += l;
3980 continue;
3981 }
3982
3983 if (sscanf(cpu_char, "%d", &cpu) != 1)
3984 continue;
3985 if (!cpu_in_cpuset(cpu, cpuset))
3986 continue;
3987 curcpu ++;
3988
3989 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3990 &user,
3991 &nice,
3992 &system,
3993 &idle,
3994 &iowait,
3995 &irq,
3996 &softirq,
3997 &steal,
3998 &guest,
3999 &guest_nice);
4000
4001 if (ret != 10 || !cg_cpu_usage) {
4002 c = strchr(line, ' ');
4003 if (!c)
4004 continue;
4005 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4006 if (l < 0) {
4007 perror("Error writing to cache");
4008 rv = 0;
4009 goto err;
4010
4011 }
4012 if (l >= cache_size) {
4013 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4014 rv = 0;
4015 goto err;
4016 }
4017
4018 cache += l;
4019 cache_size -= l;
4020 total_len += l;
4021
4022 if (ret != 10)
4023 continue;
4024 }
4025
4026 if (cg_cpu_usage) {
4027 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4028 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4029
4030 if (all_used >= cg_used) {
4031 new_idle = idle + (all_used - cg_used);
4032
4033 } else {
4034 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4035 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4036 curcpu, cg, all_used, cg_used);
4037 new_idle = idle;
4038 }
4039
4040 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4041 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4042 new_idle);
4043
4044 if (l < 0) {
4045 perror("Error writing to cache");
4046 rv = 0;
4047 goto err;
4048
4049 }
4050 if (l >= cache_size) {
4051 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4052 rv = 0;
4053 goto err;
4054 }
4055
4056 cache += l;
4057 cache_size -= l;
4058 total_len += l;
4059
4060 user_sum += cg_cpu_usage[curcpu].user;
4061 system_sum += cg_cpu_usage[curcpu].system;
4062 idle_sum += new_idle;
4063
4064 } else {
4065 user_sum += user;
4066 nice_sum += nice;
4067 system_sum += system;
4068 idle_sum += idle;
4069 iowait_sum += iowait;
4070 irq_sum += irq;
4071 softirq_sum += softirq;
4072 steal_sum += steal;
4073 guest_sum += guest;
4074 guest_nice_sum += guest_nice;
4075 }
4076 }
4077
4078 cache = d->buf;
4079
4080 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4081 user_sum,
4082 nice_sum,
4083 system_sum,
4084 idle_sum,
4085 iowait_sum,
4086 irq_sum,
4087 softirq_sum,
4088 steal_sum,
4089 guest_sum,
4090 guest_nice_sum);
4091 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4092 memcpy(cache, cpuall, cpuall_len);
4093 cache += cpuall_len;
4094 } else {
4095 /* shouldn't happen */
4096 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4097 cpuall_len = 0;
4098 }
4099
4100 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4101 total_len += cpuall_len;
4102 d->cached = 1;
4103 d->size = total_len;
4104 if (total_len > size)
4105 total_len = size;
4106
4107 memcpy(buf, d->buf, total_len);
4108 rv = total_len;
4109
4110 err:
4111 if (f)
4112 fclose(f);
4113 if (cg_cpu_usage)
4114 free(cg_cpu_usage);
4115 free(line);
4116 free(cpuset);
4117 free(cg);
4118 return rv;
4119 }
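
/*
 * Worked example for the per-cpu idle adjustment in proc_stat_read()
 * (hypothetical ticks): if the host's /proc/stat line for a cpu sums to
 * all_used = 1000 with idle = 9000, and cpuacct.usage_all says the
 * container used cg_used = 250 on that cpu, the exported line becomes
 *
 *     cpuN <cg user> 0 <cg system> 9750 0 0 0 0 0 0
 *
 * because new_idle = idle + (all_used - cg_used) = 9000 + 750.
 */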
4120
4121 /* This function retrieves the busy time of a group of tasks by looking at
4122 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4123 * been given its own cpuacct cgroup. If not, this function will take the busy
4124 * time of all other tasks that do not actually belong to the container into
4125 * account as well. If someone has a clever solution for this please send a
4126 * patch!
4127 */
4128 static unsigned long get_reaper_busy(pid_t task)
4129 {
4130 pid_t initpid = lookup_initpid_in_store(task);
4131 char *cgroup = NULL, *usage_str = NULL;
4132 unsigned long usage = 0;
4133
4134 if (initpid <= 0)
4135 return 0;
4136
4137 cgroup = get_pid_cgroup(initpid, "cpuacct");
4138 if (!cgroup)
4139 goto out;
4140 prune_init_slice(cgroup);
4141 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4142 goto out;
4143 usage = strtoul(usage_str, NULL, 10);
4144 usage /= 1000000000;
4145
4146 out:
4147 free(cgroup);
4148 free(usage_str);
4149 return usage;
4150 }
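
/*
 * Example (hypothetical value): a cpuacct.usage reading of 12345678901 ns
 * becomes a busy time of 12 seconds after the integer division by 10^9
 * above.
 */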
4151
4152 #if RELOADTEST
4153 void iwashere(void)
4154 {
4155 int fd;
4156
4157 fd = creat("/tmp/lxcfs-iwashere", 0644);
4158 if (fd >= 0)
4159 close(fd);
4160 }
4161 #endif
4162
4163 /*
4164 * The first field is the age of the reaper for the calling pid, as
4165 * returned by get_reaper_age(). The second field is that age minus the
4166 * busy time reported by get_reaper_busy(), i.e. the reaper's idle time.
4167 */
4168 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4169 struct fuse_file_info *fi)
4170 {
4171 struct fuse_context *fc = fuse_get_context();
4172 struct file_info *d = (struct file_info *)fi->fh;
4173 unsigned long int busytime = get_reaper_busy(fc->pid);
4174 char *cache = d->buf;
4175 ssize_t total_len = 0;
4176 uint64_t idletime, reaperage;
4177
4178 #if RELOADTEST
4179 iwashere();
4180 #endif
4181
4182 if (offset){
4183 if (!d->cached)
4184 return 0;
4185 if (offset > d->size)
4186 return -EINVAL;
4187 int left = d->size - offset;
4188 total_len = left > size ? size: left;
4189 memcpy(buf, cache + offset, total_len);
4190 return total_len;
4191 }
4192
4193 reaperage = get_reaper_age(fc->pid);
4194 /* To understand why this is done, please read the comment to the
4195 * get_reaper_busy() function.
4196 */
4197 idletime = reaperage;
4198 if (reaperage >= busytime)
4199 idletime = reaperage - busytime;
4200
4201 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4202 if (total_len < 0 || total_len >= d->buflen){
4203 lxcfs_error("%s\n", "failed to write to cache");
4204 return 0;
4205 }
4206
4207 d->size = (int)total_len;
4208 d->cached = 1;
4209
4210 if (total_len > size) total_len = size;
4211
4212 memcpy(buf, d->buf, total_len);
4213 return total_len;
4214 }
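
/*
 * Example output (hypothetical numbers): for a reaper that is 1000 seconds
 * old with 40 seconds of accumulated cpuacct busy time, the emulated
 * /proc/uptime reads:
 *
 *     1000.00 960.00
 */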
4215
4216 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4217 struct fuse_file_info *fi)
4218 {
4219 char dev_name[72];
4220 struct fuse_context *fc = fuse_get_context();
4221 struct file_info *d = (struct file_info *)fi->fh;
4222 char *cg;
4223 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4224 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4225 unsigned long read = 0, write = 0;
4226 unsigned long read_merged = 0, write_merged = 0;
4227 unsigned long read_sectors = 0, write_sectors = 0;
4228 unsigned long read_ticks = 0, write_ticks = 0;
4229 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4230 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4231 char *cache = d->buf;
4232 size_t cache_size = d->buflen;
4233 char *line = NULL;
4234 size_t linelen = 0, total_len = 0, rv = 0;
4235 unsigned int major = 0, minor = 0;
4236 int i = 0;
4237 FILE *f = NULL;
4238
4239 if (offset){
4240 if (offset > d->size)
4241 return -EINVAL;
4242 if (!d->cached)
4243 return 0;
4244 int left = d->size - offset;
4245 total_len = left > size ? size: left;
4246 memcpy(buf, cache + offset, total_len);
4247 return total_len;
4248 }
4249
4250 pid_t initpid = lookup_initpid_in_store(fc->pid);
4251 if (initpid <= 0)
4252 initpid = fc->pid;
4253 cg = get_pid_cgroup(initpid, "blkio");
4254 if (!cg)
4255 return read_file("/proc/diskstats", buf, size, d);
4256 prune_init_slice(cg);
4257
4258 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4259 goto err;
4260 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4261 goto err;
4262 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4263 goto err;
4264 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4265 goto err;
4266 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4267 goto err;
4268
4269
4270 f = fopen("/proc/diskstats", "r");
4271 if (!f)
4272 goto err;
4273
4274 while (getline(&line, &linelen, f) != -1) {
4275 ssize_t l;
4276 char lbuf[256];
4277
4278 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4279 if (i != 3)
4280 continue;
4281
4282 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4283 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4284 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4285 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4286 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4287 read_sectors = read_sectors/512;
4288 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4289 write_sectors = write_sectors/512;
4290
4291 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4292 rd_svctm = rd_svctm/1000000;
4293 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4294 rd_wait = rd_wait/1000000;
4295 read_ticks = rd_svctm + rd_wait;
4296
4297 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4298 wr_svctm = wr_svctm/1000000;
4299 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4300 wr_wait = wr_wait/1000000;
4301 write_ticks = wr_svctm + wr_wait;
4302
4303 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4304 tot_ticks = tot_ticks/1000000;
4305
4306 memset(lbuf, 0, 256);
4307 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4308 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4309 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4310 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4311 else
4312 continue;
4313
4314 l = snprintf(cache, cache_size, "%s", lbuf);
4315 if (l < 0) {
4316 perror("Error writing to fuse buf");
4317 rv = 0;
4318 goto err;
4319 }
4320 if (l >= cache_size) {
4321 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4322 rv = 0;
4323 goto err;
4324 }
4325 cache += l;
4326 cache_size -= l;
4327 total_len += l;
4328 }
4329
4330 d->cached = 1;
4331 d->size = total_len;
4332 if (total_len > size) total_len = size;
4333 memcpy(buf, d->buf, total_len);
4334
4335 rv = total_len;
4336 err:
4337 free(cg);
4338 if (f)
4339 fclose(f);
4340 free(line);
4341 free(io_serviced_str);
4342 free(io_merged_str);
4343 free(io_service_bytes_str);
4344 free(io_wait_time_str);
4345 free(io_service_time_str);
4346 return rv;
4347 }
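/*
 * Illustrative example of the line synthesized above, for a hypothetical
 * device 8:0 (sda) whose blkio counters reported 100 reads and 200 writes:
 *
 *   8 0 sda 100 0 2048 10 200 0 4096 20 0 30 0
 *
 * The eleven value columns follow the kernel's /proc/diskstats layout
 * (reads, reads merged, sectors read, ms reading, writes, writes merged,
 * sectors written, ms writing, I/Os in flight, ms doing I/O, weighted ms).
 * The "in flight" and "weighted ms" fields are always 0 here because the
 * blkio controller does not expose them (ios_pgr and rq_ticks stay 0).
 */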
4348
4349 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4350 struct fuse_file_info *fi)
4351 {
4352 struct fuse_context *fc = fuse_get_context();
4353 struct file_info *d = (struct file_info *)fi->fh;
4354 char *cg = NULL;
4355 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4356 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4357 ssize_t total_len = 0, rv = 0;
4358 ssize_t l = 0;
4359 char *cache = d->buf;
4360
4361 if (offset) {
4362 if (offset > d->size)
4363 return -EINVAL;
4364 if (!d->cached)
4365 return 0;
4366 int left = d->size - offset;
4367 total_len = left > size ? size : left;
4368 memcpy(buf, cache + offset, total_len);
4369 return total_len;
4370 }
4371
4372 pid_t initpid = lookup_initpid_in_store(fc->pid);
4373 if (initpid <= 0)
4374 initpid = fc->pid;
4375 cg = get_pid_cgroup(initpid, "memory");
4376 if (!cg)
4377 return read_file("/proc/swaps", buf, size, d);
4378 prune_init_slice(cg);
4379
4380 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4381
4382 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4383 goto err;
4384
4385 memusage = strtoul(memusage_str, NULL, 10);
4386
4387 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4388 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4389
4390 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4391 memswusage = strtoul(memswusage_str, NULL, 10);
4392
4393 swap_total = (memswlimit - memlimit) / 1024;
4394 swap_free = (memswusage - memusage) / 1024;
4395 }
4396
4397 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4398
4399 /* When no mem + swap limit is specified, or swapaccount=0 is set */
4400 if (!memswlimit) {
4401 char *line = NULL;
4402 size_t linelen = 0;
4403 FILE *f = fopen("/proc/meminfo", "r");
4404
4405 if (!f)
4406 goto err;
4407
4408 while (getline(&line, &linelen, f) != -1) {
4409 if (startswith(line, "SwapTotal:")) {
4410 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4411 } else if (startswith(line, "SwapFree:")) {
4412 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4413 }
4414 }
4415
4416 free(line);
4417 fclose(f);
4418 }
4419
4420 if (swap_total > 0) {
4421 l = snprintf(d->buf + total_len, d->size - total_len,
4422 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4423 swap_total, swap_free);
4424 total_len += l;
4425 }
4426
4427 if (total_len < 0 || l < 0) {
4428 perror("Error writing to cache");
4429 rv = 0;
4430 goto err;
4431 }
4432
4433 d->cached = 1;
4434 d->size = (int)total_len;
4435
4436 if (total_len > size) total_len = size;
4437 memcpy(buf, d->buf, total_len);
4438 rv = total_len;
4439
4440 err:
4441 free(cg);
4442 free(memswlimit_str);
4443 free(memlimit_str);
4444 free(memusage_str);
4445 free(memswusage_str);
4446 return rv;
4447 }
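/*
 * Worked example for the arithmetic above (hypothetical limits): with
 * memory.memsw.limit_in_bytes = 2 GiB and memory.limit_in_bytes = 1 GiB,
 * swap_total = (2147483648 - 1073741824) / 1024 = 1048576 kB. With a
 * memsw usage of 1.5 GiB and a memory usage of 1.25 GiB, the second value
 * is (1610612736 - 1342177280) / 1024 = 262144 kB, printed in the "Used"
 * column of the synthesized /proc/swaps line.
 */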
4448 /*
4449 * Collect the pids below a cgroup path by reading each cgroup.procs file,
4450 * e.g. /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
4451 * @pid_buf : array receiving one pid string per entry.
4452 * @dpath : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
4453 * @depth : how many directory levels below @dpath to descend.
4454 * @sum : the number of pids collected so far; the new total is returned.
4455 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu.
4456 */
4457 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4458 {
4459 DIR *dir;
4460 int fd;
4461 struct dirent *file;
4462 FILE *f = NULL;
4463 size_t linelen = 0;
4464 char *line = NULL;
4465 int pd;
4466 char *path_dir, *path;
4467 char **pid;
4468
4469 /* path = dpath + "/cgroup.procs" + '\0' */
4470 do {
4471 path = malloc(strlen(dpath) + 20);
4472 } while (!path);
4473
4474 strcpy(path, dpath);
4475 fd = openat(cfd, path, O_RDONLY);
4476 if (fd < 0)
4477 goto out;
4478
4479 dir = fdopendir(fd);
4480 if (dir == NULL) {
4481 close(fd);
4482 goto out;
4483 }
4484
4485 while (((file = readdir(dir)) != NULL) && depth > 0) {
4486 if (strncmp(file->d_name, ".", 1) == 0)
4487 continue;
4488 if (strncmp(file->d_name, "..", 1) == 0)
4489 continue;
4490 if (file->d_type == DT_DIR) {
4491 /* path + '/' + d_name + '\0' */
4492 do {
4493 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4494 } while (!path_dir);
4495 strcpy(path_dir, path);
4496 strcat(path_dir, "/");
4497 strcat(path_dir, file->d_name);
4498 pd = depth - 1;
4499 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4500 free(path_dir);
4501 }
4502 }
4503 closedir(dir);
4504
4505 strcat(path, "/cgroup.procs");
4506 fd = openat(cfd, path, O_RDONLY);
4507 if (fd < 0)
4508 goto out;
4509
4510 f = fdopen(fd, "r");
4511 if (!f) {
4512 close(fd);
4513 goto out;
4514 }
4515
4516 while (getline(&line, &linelen, f) != -1) {
4517 do {
4518 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4519 } while (!pid);
4520 *pid_buf = pid;
4521 do {
4522 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4523 } while (*(*pid_buf + sum) == NULL);
4524 strcpy(*(*pid_buf + sum), line);
4525 sum++;
4526 }
4527 fclose(f);
4528 out:
4529 free(path);
4530 return sum;
4531 }
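/*
 * Typical call, as made by refresh_load() below (the container id here is
 * purely illustrative):
 *
 *   sum = calc_pid(&pid_buf, "./docker/abc", DEPTH_DIR, 0, cfd);
 *
 * This descends at most DEPTH_DIR directory levels and returns the number
 * of pid strings now stored in pid_buf.
 */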
4532 /*
4533 * calc_load calculates the load average according to the following formula:
4534 * load1 = load0 * exp + active * (1 - exp)
4535 *
4536 * @load1: the new load average.
4537 * @load0: the previous load average.
4538 * @active: the number of currently running pids.
4539 * @exp: the fixed-point decay constant defined at the top of this file.
4540 */
4541 static unsigned long
4542 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4543 {
4544 unsigned long newload;
4545
4546 active = active > 0 ? active * FIXED_1 : 0;
4547 newload = load * exp + active * (FIXED_1 - exp);
4548 if (active >= load)
4549 newload += FIXED_1 - 1;
4550
4551 return newload / FIXED_1;
4552 }
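/*
 * Worked example for one 5 second tick of the 1 minute average: with
 * FIXED_1 = 2048, EXP_1 = 1884, load = 0 and active = 2 running pids,
 *
 *   newload = 0 * 1884 + (2 * 2048) * (2048 - 1884) + (2048 - 1) = 673791
 *   newload / FIXED_1 = 328
 *
 * which proc_loadavg_read() below formats as roughly "0.16".
 */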
4553
4554 /*
4555 * Returns 0 if the container cgroup p->cg has gone away,
4556 * -1 if an error occurred during the refresh,
4557 * and a positive number equal to the total number of pids otherwise.
4558 */
4559 static int refresh_load(struct load_node *p, char *path)
4560 {
4561 FILE *f = NULL;
4562 char **idbuf;
4563 char proc_path[256];
4564 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4565 char *line = NULL;
4566 size_t linelen = 0;
4567 int sum, length;
4568 DIR *dp;
4569 struct dirent *file;
4570
4571 do {
4572 idbuf = malloc(sizeof(char *));
4573 } while (!idbuf);
4574 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4575 /* normal exit */
4576 if (sum == 0)
4577 goto out;
4578
4579 for (i = 0; i < sum; i++) {
4580 /* Strip the trailing '\n'. */
4581 length = strlen(idbuf[i])-1;
4582 idbuf[i][length] = '\0';
4583 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4584 if (ret < 0 || ret > 255) {
4585 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4586 i = sum;
4587 sum = -1;
4588 goto err_out;
4589 }
4590
4591 dp = opendir(proc_path);
4592 if (!dp) {
4593 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4594 continue;
4595 }
4596 while ((file = readdir(dp)) != NULL) {
4597 if (strncmp(file->d_name, ".", 1) == 0)
4598 continue;
4599 if (strncmp(file->d_name, "..", 1) == 0)
4600 continue;
4601 total_pid++;
4602 /* Track the largest pid seen as last_pid. */
4603 ret = atof(file->d_name);
4604 last_pid = (ret > last_pid) ? ret : last_pid;
4605
4606 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4607 if (ret < 0 || ret > 255) {
4608 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4609 i = sum;
4610 sum = -1;
4611 closedir(dp);
4612 goto err_out;
4613 }
4614 f = fopen(proc_path, "r");
4615 if (f != NULL) {
4616 while (getline(&line, &linelen, f) != -1) {
4617 /* Find State */
4618 if ((line[0] == 'S') && (line[1] == 't'))
4619 break;
4620 }
4621 if ((line[7] == 'R') || (line[7] == 'D'))
4622 run_pid++;
4623 fclose(f);
4624 }
4625 }
4626 closedir(dp);
4627 }
4628 /* Calculate the load averages. */
4629 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4630 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4631 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4632 p->run_pid = run_pid;
4633 p->total_pid = total_pid;
4634 p->last_pid = last_pid;
4635
4636 free(line);
4637 err_out:
4638 for (; i > 0; i--)
4639 free(idbuf[i-1]);
4640 out:
4641 free(idbuf);
4642 return sum;
4643 }
4644 /*
4645 * Worker thread: traverse the hash table and refresh every node's load.
4646 */
4647 void *load_begin(void *arg)
4648 {
4649
4650 char *path = NULL;
4651 int i, sum, length, ret;
4652 struct load_node *f;
4653 int first_node;
4654 clock_t time1, time2;
4655
4656 while (1) {
4657 if (loadavg_stop == 1)
4658 return NULL;
4659
4660 time1 = clock();
4661 for (i = 0; i < LOAD_SIZE; i++) {
4662 pthread_mutex_lock(&load_hash[i].lock);
4663 if (load_hash[i].next == NULL) {
4664 pthread_mutex_unlock(&load_hash[i].lock);
4665 continue;
4666 }
4667 f = load_hash[i].next;
4668 first_node = 1;
4669 while (f) {
4670 length = strlen(f->cg) + 2;
4671 do {
4672 /* strlen(f->cg) + '.' or "" + '\0' */
4673 path = malloc(length);
4674 } while (!path);
4675
4676 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4677 if (ret < 0 || ret > length - 1) {
4678 /* snprintf failed, ignore the node.*/
4679 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4680 goto out;
4681 }
4682 sum = refresh_load(f, path);
4683 if (sum == 0) {
4684 f = del_node(f, i);
4685 } else {
4686 out: f = f->next;
4687 }
4688 free(path);
4689 /* load_hash[i].lock is held only while handling the first node. */
4690 if (first_node == 1) {
4691 first_node = 0;
4692 pthread_mutex_unlock(&load_hash[i].lock);
4693 }
4694 }
4695 }
4696
4697 if (loadavg_stop == 1)
4698 return NULL;
4699
4700 time2 = clock();
4701 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4702 }
4703 }
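/*
 * A full refresh pass over all LOAD_SIZE buckets therefore runs roughly
 * every FLUSH_TIME (5) seconds: the usleep() above subtracts the time the
 * pass itself consumed, as measured with clock(), from the flush interval.
 */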
4704
4705 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4706 struct fuse_file_info *fi)
4707 {
4708 struct fuse_context *fc = fuse_get_context();
4709 struct file_info *d = (struct file_info *)fi->fh;
4710 pid_t initpid;
4711 char *cg;
4712 size_t total_len = 0;
4713 char *cache = d->buf;
4714 struct load_node *n;
4715 int hash;
4716 int cfd, rv = 0;
4717 unsigned long a, b, c;
4718
4719 if (offset) {
4720 if (offset > d->size)
4721 return -EINVAL;
4722 if (!d->cached)
4723 return 0;
4724 int left = d->size - offset;
4725 total_len = left > size ? size : left;
4726 memcpy(buf, cache + offset, total_len);
4727 return total_len;
4728 }
4729 if (!loadavg)
4730 return read_file("/proc/loadavg", buf, size, d);
4731
4732 initpid = lookup_initpid_in_store(fc->pid);
4733 if (initpid <= 0)
4734 initpid = fc->pid;
4735 cg = get_pid_cgroup(initpid, "cpu");
4736 if (!cg)
4737 return read_file("/proc/loadavg", buf, size, d);
4738
4739 prune_init_slice(cg);
4740 hash = calc_hash(cg);
4741 n = locate_node(cg, hash);
4742
4743 /* First time this cgroup is seen: allocate a node and insert it. */
4744 if (n == NULL) {
4745 if (!find_mounted_controller("cpu", &cfd)) {
4746 /*
4747 * locate_node() above leaves load_hash[hash].rdlock held, because a
4748 * node must not be deleted while a reader is still using it; drop it here.
4749 */
4750 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4751 rv = 0;
4752 goto err;
4753 }
4754 do {
4755 n = malloc(sizeof(struct load_node));
4756 } while (!n);
4757
4758 do {
4759 n->cg = malloc(strlen(cg)+1);
4760 } while (!n->cg);
4761 strcpy(n->cg, cg);
4762 n->avenrun[0] = 0;
4763 n->avenrun[1] = 0;
4764 n->avenrun[2] = 0;
4765 n->run_pid = 0;
4766 n->total_pid = 1;
4767 n->last_pid = initpid;
4768 n->cfd = cfd;
4769 insert_node(&n, hash);
4770 }
4771 a = n->avenrun[0] + (FIXED_1/200);
4772 b = n->avenrun[1] + (FIXED_1/200);
4773 c = n->avenrun[2] + (FIXED_1/200);
4774 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4775 LOAD_INT(a), LOAD_FRAC(a),
4776 LOAD_INT(b), LOAD_FRAC(b),
4777 LOAD_INT(c), LOAD_FRAC(c),
4778 n->run_pid, n->total_pid, n->last_pid);
4779 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4780 if (total_len < 0 || total_len >= d->buflen) {
4781 lxcfs_error("%s\n", "Failed to write to cache");
4782 rv = 0;
4783 goto err;
4784 }
4785 d->size = (int)total_len;
4786 d->cached = 1;
4787
4788 if (total_len > size)
4789 total_len = size;
4790 memcpy(buf, d->buf, total_len);
4791 rv = total_len;
4792
4793 err:
4794 free(cg);
4795 return rv;
4796 }
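/*
 * Example of the synthesized /proc/loadavg line (hypothetical values):
 *
 *   0.16 0.03 0.01 2/40 1234
 *
 * i.e. the three decayed averages from avenrun[], then run_pid/total_pid
 * and finally last_pid, all taken from this container's load_node.
 */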
4797 /* Returns the thread id of the load daemon on success, 0 on failure. */
4798 pthread_t load_daemon(int load_use)
4799 {
4800 int ret;
4801 pthread_t pid;
4802
4803 ret = init_load();
4804 if (ret == -1) {
4805 lxcfs_error("%s\n", "Failed to initialize the load hash table in load_daemon!");
4806 return 0;
4807 }
4808 ret = pthread_create(&pid, NULL, load_begin, NULL);
4809 if (ret != 0) {
4810 lxcfs_error("%s\n", "Failed to create the load daemon thread in load_daemon!");
4811 load_free();
4812 return 0;
4813 }
4814 /* Enable loadavg virtualization; load_use is expected to be 1 here. */
4815 loadavg = load_use;
4816 return pid;
4817 }
4818
4819 /* Returns 0 on success. */
4820 int stop_load_daemon(pthread_t pid)
4821 {
4822 int s;
4823
4824 /* Signal the thread to gracefully stop */
4825 loadavg_stop = 1;
4826
4827 s = pthread_join(pid, NULL); /* Wait for the load daemon thread to exit. */
4828 if (s != 0) {
4829 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4830 return -1;
4831 }
4832
4833 load_free();
4834 loadavg_stop = 0;
4835
4836 return 0;
4837 }
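/*
 * Minimal caller sketch, assuming loadavg virtualization was requested
 * (for instance via a command line option handled by the frontend):
 *
 *   pthread_t tid = load_daemon(1);
 *   if (tid) {
 *           ... serve requests ...
 *           stop_load_daemon(tid);
 *   }
 *
 * Treating 0 as failure is only meaningful because load_daemon() itself
 * returns 0 when it could not start the thread.
 */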
4838
4839 static off_t get_procfile_size(const char *which)
4840 {
4841 FILE *f = fopen(which, "r");
4842 char *line = NULL;
4843 size_t len = 0;
4844 ssize_t sz, answer = 0;
4845 if (!f)
4846 return 0;
4847
4848 while ((sz = getline(&line, &len, f)) != -1)
4849 answer += sz;
4850 fclose (f);
4851 free(line);
4852
4853 return answer;
4854 }
4855
4856 int proc_getattr(const char *path, struct stat *sb)
4857 {
4858 struct timespec now;
4859
4860 memset(sb, 0, sizeof(struct stat));
4861 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
4862 return -EINVAL;
4863 sb->st_uid = sb->st_gid = 0;
4864 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
4865 if (strcmp(path, "/proc") == 0) {
4866 sb->st_mode = S_IFDIR | 00555;
4867 sb->st_nlink = 2;
4868 return 0;
4869 }
4870 if (strcmp(path, "/proc/meminfo") == 0 ||
4871 strcmp(path, "/proc/cpuinfo") == 0 ||
4872 strcmp(path, "/proc/uptime") == 0 ||
4873 strcmp(path, "/proc/stat") == 0 ||
4874 strcmp(path, "/proc/diskstats") == 0 ||
4875 strcmp(path, "/proc/swaps") == 0 ||
4876 strcmp(path, "/proc/loadavg") == 0) {
4877 sb->st_size = 0;
4878 sb->st_mode = S_IFREG | 00444;
4879 sb->st_nlink = 1;
4880 return 0;
4881 }
4882
4883 return -ENOENT;
4884 }
4885
4886 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4887 struct fuse_file_info *fi)
4888 {
4889 if (filler(buf, ".", NULL, 0) != 0 ||
4890 filler(buf, "..", NULL, 0) != 0 ||
4891 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4892 filler(buf, "meminfo", NULL, 0) != 0 ||
4893 filler(buf, "stat", NULL, 0) != 0 ||
4894 filler(buf, "uptime", NULL, 0) != 0 ||
4895 filler(buf, "diskstats", NULL, 0) != 0 ||
4896 filler(buf, "swaps", NULL, 0) != 0 ||
4897 filler(buf, "loadavg", NULL, 0) != 0)
4898 return -EINVAL;
4899 return 0;
4900 }
4901
4902 int proc_open(const char *path, struct fuse_file_info *fi)
4903 {
4904 int type = -1;
4905 struct file_info *info;
4906
4907 if (strcmp(path, "/proc/meminfo") == 0)
4908 type = LXC_TYPE_PROC_MEMINFO;
4909 else if (strcmp(path, "/proc/cpuinfo") == 0)
4910 type = LXC_TYPE_PROC_CPUINFO;
4911 else if (strcmp(path, "/proc/uptime") == 0)
4912 type = LXC_TYPE_PROC_UPTIME;
4913 else if (strcmp(path, "/proc/stat") == 0)
4914 type = LXC_TYPE_PROC_STAT;
4915 else if (strcmp(path, "/proc/diskstats") == 0)
4916 type = LXC_TYPE_PROC_DISKSTATS;
4917 else if (strcmp(path, "/proc/swaps") == 0)
4918 type = LXC_TYPE_PROC_SWAPS;
4919 else if (strcmp(path, "/proc/loadavg") == 0)
4920 type = LXC_TYPE_PROC_LOADAVG;
4921 if (type == -1)
4922 return -ENOENT;
4923
4924 info = malloc(sizeof(*info));
4925 if (!info)
4926 return -ENOMEM;
4927
4928 memset(info, 0, sizeof(*info));
4929 info->type = type;
4930
4931 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4932 do {
4933 info->buf = malloc(info->buflen);
4934 } while (!info->buf);
4935 memset(info->buf, 0, info->buflen);
4936 /* set actual size to buffer size */
4937 info->size = info->buflen;
4938
4939 fi->fh = (unsigned long)info;
4940 return 0;
4941 }
4942
4943 int proc_access(const char *path, int mask)
4944 {
4945 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
4946 return 0;
4947
4948 /* these are all read-only */
4949 if ((mask & ~R_OK) != 0)
4950 return -EACCES;
4951 return 0;
4952 }
4953
4954 int proc_release(const char *path, struct fuse_file_info *fi)
4955 {
4956 do_release_file_info(fi);
4957 return 0;
4958 }
4959
4960 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4961 struct fuse_file_info *fi)
4962 {
4963 struct file_info *f = (struct file_info *) fi->fh;
4964
4965 switch (f->type) {
4966 case LXC_TYPE_PROC_MEMINFO:
4967 return proc_meminfo_read(buf, size, offset, fi);
4968 case LXC_TYPE_PROC_CPUINFO:
4969 return proc_cpuinfo_read(buf, size, offset, fi);
4970 case LXC_TYPE_PROC_UPTIME:
4971 return proc_uptime_read(buf, size, offset, fi);
4972 case LXC_TYPE_PROC_STAT:
4973 return proc_stat_read(buf, size, offset, fi);
4974 case LXC_TYPE_PROC_DISKSTATS:
4975 return proc_diskstats_read(buf, size, offset, fi);
4976 case LXC_TYPE_PROC_SWAPS:
4977 return proc_swaps_read(buf, size, offset, fi);
4978 case LXC_TYPE_PROC_LOADAVG:
4979 return proc_loadavg_read(buf, size, offset, fi);
4980 default:
4981 return -EINVAL;
4982 }
4983 }
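/*
 * These proc_* handlers are meant to be dispatched from the top-level FUSE
 * operations for paths below /proc. A minimal sketch of such a wiring
 * (hypothetical; the real dispatch lives in the lxcfs frontend):
 *
 *   static const struct fuse_operations ops = {
 *           .getattr = proc_getattr,
 *           .readdir = proc_readdir,
 *           .open    = proc_open,
 *           .access  = proc_access,
 *           .release = proc_release,
 *           .read    = proc_read,
 *   };
 */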
4984
4985 /*
4986 * Functions needed to set up cgroups in the __constructor__.
4987 */
4988
4989 static bool mkdir_p(const char *dir, mode_t mode)
4990 {
4991 const char *tmp = dir;
4992 const char *orig = dir;
4993 char *makeme;
4994
4995 do {
4996 dir = tmp + strspn(tmp, "/");
4997 tmp = dir + strcspn(dir, "/");
4998 makeme = strndup(orig, dir - orig);
4999 if (!makeme)
5000 return false;
5001 if (mkdir(makeme, mode) && errno != EEXIST) {
5002 lxcfs_error("Failed to create directory '%s': %s.\n",
5003 makeme, strerror(errno));
5004 free(makeme);
5005 return false;
5006 }
5007 free(makeme);
5008 } while(tmp != dir);
5009
5010 return true;
5011 }
5012
5013 static bool umount_if_mounted(void)
5014 {
5015 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5016 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5017 return false;
5018 }
5019 return true;
5020 }
5021
5022 /* __typeof__ should be safe to use with all compilers. */
5023 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5024 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5025 {
5026 return (fs->f_type == (fs_type_magic)magic_val);
5027 }
5028
5029 /*
5030 * Looking at fs/proc_namespace.c, it appears we can expect the rootfs
5031 * entry in /proc/self/mountinfo to very specifically contain
5032 * " - rootfs rootfs "
5033 * and, so long as we have chrooted so that rootfs is not our root,
5034 * the rootfs entry should always be skipped in the mountinfo contents.
5035 */
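/*
 * An illustrative /proc/self/mountinfo line that would match below:
 *
 *   15 0 0:1 / / rw - rootfs rootfs rw
 *
 * The fifth field is the mount point that is compared against "/", and the
 * part after " - " holds the filesystem type and source.
 */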
5036 static bool is_on_ramfs(void)
5037 {
5038 FILE *f;
5039 char *p, *p2;
5040 char *line = NULL;
5041 size_t len = 0;
5042 int i;
5043
5044 f = fopen("/proc/self/mountinfo", "r");
5045 if (!f)
5046 return false;
5047
5048 while (getline(&line, &len, f) != -1) {
5049 for (p = line, i = 0; p && i < 4; i++)
5050 p = strchr(p + 1, ' ');
5051 if (!p)
5052 continue;
5053 p2 = strchr(p + 1, ' ');
5054 if (!p2)
5055 continue;
5056 *p2 = '\0';
5057 if (strcmp(p + 1, "/") == 0) {
5058 // this is '/'. is it the ramfs?
5059 p = strchr(p2 + 1, '-');
5060 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5061 free(line);
5062 fclose(f);
5063 return true;
5064 }
5065 }
5066 }
5067 free(line);
5068 fclose(f);
5069 return false;
5070 }
5071
5072 static int pivot_enter()
5073 {
5074 int ret = -1, oldroot = -1, newroot = -1;
5075
5076 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5077 if (oldroot < 0) {
5078 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5079 return ret;
5080 }
5081
5082 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5083 if (newroot < 0) {
5084 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5085 goto err;
5086 }
5087
5088 /* change into new root fs */
5089 if (fchdir(newroot) < 0) {
5090 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5091 goto err;
5092 }
5093
5094 /* pivot_root into our new root fs */
5095 if (pivot_root(".", ".") < 0) {
5096 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5097 goto err;
5098 }
5099
5100 /*
5101 * At this point the old root is mounted on top of our new root. To
5102 * unmount it we must not be chdir()'d into it, so escape back
5103 * to the old root.
5104 */
5105 if (fchdir(oldroot) < 0) {
5106 lxcfs_error("%s\n", "Failed to enter old root.");
5107 goto err;
5108 }
5109
5110 if (umount2(".", MNT_DETACH) < 0) {
5111 lxcfs_error("%s\n", "Failed to detach old root.");
5112 goto err;
5113 }
5114
5115 if (fchdir(newroot) < 0) {
5116 lxcfs_error("%s\n", "Failed to re-enter new root.");
5117 goto err;
5118 }
5119
5120 ret = 0;
5121
5122 err:
5123 if (oldroot > 0)
5124 close(oldroot);
5125 if (newroot > 0)
5126 close(newroot);
5127
5128 return ret;
5129 }
5130
5131 static int chroot_enter()
5132 {
5133 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5134 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5135 return -1;
5136 }
5137
5138 if (chroot(".") < 0) {
5139 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5140 return -1;
5141 }
5142
5143 if (chdir("/") < 0) {
5144 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5145 return -1;
5146 }
5147
5148 return 0;
5149 }
5150
5151 static int permute_and_enter(void)
5152 {
5153 struct statfs sb;
5154
5155 if (statfs("/", &sb) < 0) {
5156 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5157 return -1;
5158 }
5159
5160 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5161 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5162 * /proc/self/mountinfo via is_on_ramfs(). */
5163 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5164 return chroot_enter();
5165
5166 if (pivot_enter() < 0) {
5167 lxcfs_error("%s\n", "Could not perform pivot root.");
5168 return -1;
5169 }
5170
5171 return 0;
5172 }
5173
5174 /* Prepare our new clean root. */
5175 static int permute_prepare(void)
5176 {
5177 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5178 lxcfs_error("%s\n", "Failed to create directory for new root.");
5179 return -1;
5180 }
5181
5182 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5183 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5184 return -1;
5185 }
5186
5187 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5188 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5189 return -1;
5190 }
5191
5192 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5193 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5194 return -1;
5195 }
5196
5197 return 0;
5198 }
5199
5200 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
5201 static bool permute_root(void)
5202 {
5203 /* Prepare new root. */
5204 if (permute_prepare() < 0)
5205 return false;
5206
5207 /* Pivot into new root. */
5208 if (permute_and_enter() < 0)
5209 return false;
5210
5211 return true;
5212 }
5213
5214 static int preserve_mnt_ns(int pid)
5215 {
5216 int ret;
5217 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5218 char path[len];
5219
5220 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5221 if (ret < 0 || (size_t)ret >= len)
5222 return -1;
5223
5224 return open(path, O_RDONLY | O_CLOEXEC);
5225 }
5226
5227 static bool cgfs_prepare_mounts(void)
5228 {
5229 if (!mkdir_p(BASEDIR, 0700)) {
5230 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5231 return false;
5232 }
5233
5234 if (!umount_if_mounted()) {
5235 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5236 return false;
5237 }
5238
5239 if (unshare(CLONE_NEWNS) < 0) {
5240 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5241 return false;
5242 }
5243
5244 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5245 if (cgroup_mount_ns_fd < 0) {
5246 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5247 return false;
5248 }
5249
5250 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5251 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5252 return false;
5253 }
5254
5255 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5256 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5257 return false;
5258 }
5259
5260 return true;
5261 }
5262
5263 static bool cgfs_mount_hierarchies(void)
5264 {
5265 char *target;
5266 size_t clen, len;
5267 int i, ret;
5268
5269 for (i = 0; i < num_hierarchies; i++) {
5270 char *controller = hierarchies[i];
5271
5272 clen = strlen(controller);
5273 len = strlen(BASEDIR) + clen + 2;
5274 target = malloc(len);
5275 if (!target)
5276 return false;
5277
5278 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5279 if (ret < 0 || ret >= len) {
5280 free(target);
5281 return false;
5282 }
5283 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5284 free(target);
5285 return false;
5286 }
5287 if (!strcmp(controller, "unified"))
5288 ret = mount("none", target, "cgroup2", 0, NULL);
5289 else
5290 ret = mount(controller, target, "cgroup", 0, controller);
5291 if (ret < 0) {
5292 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5293 free(target);
5294 return false;
5295 }
5296
5297 fd_hierarchies[i] = open(target, O_DIRECTORY);
5298 if (fd_hierarchies[i] < 0) {
5299 free(target);
5300 return false;
5301 }
5302 free(target);
5303 }
5304 return true;
5305 }
5306
5307 static bool cgfs_setup_controllers(void)
5308 {
5309 if (!cgfs_prepare_mounts())
5310 return false;
5311
5312 if (!cgfs_mount_hierarchies()) {
5313 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
5314 return false;
5315 }
5316
5317 if (!permute_root())
5318 return false;
5319
5320 return true;
5321 }
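/*
 * On success, the private mount namespace created above holds a tmpfs at
 * BASEDIR with one cgroup mount per hierarchy (BASEDIR/<controller>), and
 * fd_hierarchies[], filled in by cgfs_mount_hierarchies(), keeps an
 * O_DIRECTORY fd to each of them for later use with openat().
 */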
5322
5323 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5324 {
5325 FILE *f;
5326 char *cret, *line = NULL;
5327 char cwd[MAXPATHLEN];
5328 size_t len = 0;
5329 int i, init_ns = -1;
5330 bool found_unified = false;
5331
5332 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5333 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5334 return;
5335 }
5336
5337 while (getline(&line, &len, f) != -1) {
5338 char *idx, *p, *p2;
5339
5340 p = strchr(line, ':');
5341 if (!p)
5342 goto out;
5343 idx = line;
5344 *(p++) = '\0';
5345
5346 p2 = strrchr(p, ':');
5347 if (!p2)
5348 goto out;
5349 *p2 = '\0';
5350
5351 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5352 * form: 0::/ The empty controller name would later be passed to
5353 * mount() and make the cgroup mounts fail, so record such entries
5354 * under the name "unified" instead of skipping them.
5355 */
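/* Illustrative /proc/self/cgroup content on a hybrid host:
 *   4:memory:/user.slice
 *   0::/
 * The first line yields the "memory" hierarchy, the second is recorded
 * under the name "unified".
 */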
5356 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5357 found_unified = true;
5358 p = "unified";
5359 }
5360
5361 if (!store_hierarchy(line, p))
5362 goto out;
5363 }
5364
5365 /* Preserve initial namespace. */
5366 init_ns = preserve_mnt_ns(getpid());
5367 if (init_ns < 0) {
5368 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5369 goto out;
5370 }
5371
5372 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5373 if (!fd_hierarchies) {
5374 lxcfs_error("%s\n", strerror(errno));
5375 goto out;
5376 }
5377
5378 for (i = 0; i < num_hierarchies; i++)
5379 fd_hierarchies[i] = -1;
5380
5381 cret = getcwd(cwd, MAXPATHLEN);
5382 if (!cret)
5383 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5384
5385 /* This function unshares (CLONE_NEWNS) from our initial mount namespace
5386 * so that the lxcfs cgroups can be mounted privately. */
5387 if (!cgfs_setup_controllers()) {
5388 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5389 goto out;
5390 }
5391
5392 if (setns(init_ns, 0) < 0) {
5393 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5394 goto out;
5395 }
5396
5397 if (!cret || chdir(cwd) < 0)
5398 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5399
5400 print_subsystems();
5401
5402 out:
5403 free(line);
5404 fclose(f);
5405 if (init_ns >= 0)
5406 close(init_ns);
5407 }
5408
5409 static void __attribute__((destructor)) free_subsystems(void)
5410 {
5411 int i;
5412
5413 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5414
5415 for (i = 0; i < num_hierarchies; i++) {
5416 if (hierarchies[i])
5417 free(hierarchies[i]);
5418 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5419 close(fd_hierarchies[i]);
5420 }
5421 free(hierarchies);
5422 free(fd_hierarchies);
5423
5424 if (cgroup_mount_ns_fd >= 0)
5425 close(cgroup_mount_ns_fd);
5426 }