1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #ifndef FUSE_USE_VERSION
11 #define FUSE_USE_VERSION 30
14 #ifndef FUSE_USE_VERSION
15 #define FUSE_USE_VERSION 26
19 #define _FILE_OFFSET_BITS 64
21 #define __STDC_FORMAT_MACROS
39 #include <linux/magic.h>
40 #include <linux/sched.h>
41 #include <sys/epoll.h>
43 #include <sys/mount.h>
44 #include <sys/param.h>
45 #include <sys/socket.h>
46 #include <sys/syscall.h>
47 #include <sys/sysinfo.h>
51 #include "cgroup_fuse.h"
52 #include "cgroups/cgroup.h"
53 #include "cgroups/cgroup_utils.h"
54 #include "cpuset_parse.h"
55 #include "lxcfs_fuse_compat.h"
56 #include "memory_utils.h"
57 #include "proc_loadavg.h"
58 #include "proc_cpuview.h"
62 uint64_t hierarchical_memory_limit
;
63 uint64_t hierarchical_memsw_limit
;
66 uint64_t total_rss_huge
;
68 uint64_t total_mapped_file
;
70 uint64_t total_writeback
;
72 uint64_t total_pgpgin
;
73 uint64_t total_pgpgout
;
74 uint64_t total_pgfault
;
75 uint64_t total_pgmajfault
;
76 uint64_t total_inactive_anon
;
77 uint64_t total_active_anon
;
78 uint64_t total_inactive_file
;
79 uint64_t total_active_file
;
80 uint64_t total_unevictable
;
83 __lxcfs_fuse_ops
int proc_getattr(const char *path
, struct stat
*sb
)
87 memset(sb
, 0, sizeof(struct stat
));
88 if (clock_gettime(CLOCK_REALTIME
, &now
) < 0)
91 sb
->st_uid
= sb
->st_gid
= 0;
92 sb
->st_atim
= sb
->st_mtim
= sb
->st_ctim
= now
;
93 if (strcmp(path
, "/proc") == 0) {
94 sb
->st_mode
= S_IFDIR
| 00555;
99 if (strcmp(path
, "/proc/meminfo") == 0 ||
100 strcmp(path
, "/proc/cpuinfo") == 0 ||
101 strcmp(path
, "/proc/uptime") == 0 ||
102 strcmp(path
, "/proc/stat") == 0 ||
103 strcmp(path
, "/proc/diskstats") == 0 ||
104 strcmp(path
, "/proc/swaps") == 0 ||
105 strcmp(path
, "/proc/loadavg") == 0) {
107 sb
->st_mode
= S_IFREG
| 00444;
115 __lxcfs_fuse_ops
int proc_readdir(const char *path
, void *buf
,
116 fuse_fill_dir_t filler
, off_t offset
,
117 struct fuse_file_info
*fi
)
119 if (DIR_FILLER(filler
, buf
, ".", NULL
, 0) != 0 ||
120 DIR_FILLER(filler
, buf
, "..", NULL
, 0) != 0 ||
121 DIR_FILLER(filler
, buf
, "cpuinfo", NULL
, 0) != 0 ||
122 DIR_FILLER(filler
, buf
, "meminfo", NULL
, 0) != 0 ||
123 DIR_FILLER(filler
, buf
, "stat", NULL
, 0) != 0 ||
124 DIR_FILLER(filler
, buf
, "uptime", NULL
, 0) != 0 ||
125 DIR_FILLER(filler
, buf
, "diskstats", NULL
, 0) != 0 ||
126 DIR_FILLER(filler
, buf
, "swaps", NULL
, 0) != 0 ||
127 DIR_FILLER(filler
, buf
, "loadavg", NULL
, 0) != 0)
133 static off_t
get_procfile_size(const char *path
)
135 __do_fclose
FILE *f
= NULL
;
136 __do_free
char *line
= NULL
;
138 ssize_t sz
, answer
= 0;
140 f
= fopen(path
, "re");
144 while ((sz
= getline(&line
, &len
, f
)) != -1)
150 __lxcfs_fuse_ops
int proc_open(const char *path
, struct fuse_file_info
*fi
)
152 __do_free
struct file_info
*info
= NULL
;
155 if (strcmp(path
, "/proc/meminfo") == 0)
156 type
= LXC_TYPE_PROC_MEMINFO
;
157 else if (strcmp(path
, "/proc/cpuinfo") == 0)
158 type
= LXC_TYPE_PROC_CPUINFO
;
159 else if (strcmp(path
, "/proc/uptime") == 0)
160 type
= LXC_TYPE_PROC_UPTIME
;
161 else if (strcmp(path
, "/proc/stat") == 0)
162 type
= LXC_TYPE_PROC_STAT
;
163 else if (strcmp(path
, "/proc/diskstats") == 0)
164 type
= LXC_TYPE_PROC_DISKSTATS
;
165 else if (strcmp(path
, "/proc/swaps") == 0)
166 type
= LXC_TYPE_PROC_SWAPS
;
167 else if (strcmp(path
, "/proc/loadavg") == 0)
168 type
= LXC_TYPE_PROC_LOADAVG
;
172 info
= zalloc(sizeof(*info
));
178 info
->buflen
= get_procfile_size(path
) + BUF_RESERVE_SIZE
;
180 info
->buf
= zalloc(info
->buflen
);
183 /* set actual size to buffer size */
184 info
->size
= info
->buflen
;
186 fi
->fh
= PTR_TO_UINT64(move_ptr(info
));
190 __lxcfs_fuse_ops
int proc_access(const char *path
, int mask
)
192 if (strcmp(path
, "/proc") == 0 && access(path
, R_OK
) == 0)
195 /* these are all read-only */
196 if ((mask
& ~R_OK
) != 0)
202 __lxcfs_fuse_ops
int proc_release(const char *path
, struct fuse_file_info
*fi
)
204 do_release_file_info(fi
);
208 static uint64_t get_memlimit(const char *cgroup
, bool swap
)
210 __do_free
char *memlimit_str
= NULL
;
211 uint64_t memlimit
= 0;
215 ret
= cgroup_ops
->get_memory_swap_max(cgroup_ops
, cgroup
, &memlimit_str
);
217 ret
= cgroup_ops
->get_memory_max(cgroup_ops
, cgroup
, &memlimit_str
);
218 if (ret
> 0 && memlimit_str
[0] && safe_uint64(memlimit_str
, &memlimit
, 10) < 0)
219 lxcfs_error("Failed to convert memlimit %s", memlimit_str
);
225 * This function is taken from glibc-2.32, as POSIX dirname("/some-dir") will
226 * return "/some-dir" as opposed to "/", which breaks `get_min_memlimit()`
228 static char *gnu_dirname(char *path
)
230 static const char dot
[] = ".";
234 last_slash
= path
!= NULL
? strrchr(path
, '/') : NULL
;
236 if (last_slash
!= NULL
&& last_slash
!= path
&& last_slash
[1] == '\0') {
237 /* Determine whether all remaining characters are slashes. */
240 for (runp
= last_slash
; runp
!= path
; --runp
)
244 /* The '/' is the last character, we have to look further. */
246 last_slash
= memrchr(path
, '/', runp
- path
);
249 if (last_slash
!= NULL
) {
250 /* Determine whether all remaining characters are slashes. */
253 for (runp
= last_slash
; runp
!= path
; --runp
)
257 /* Terminate the path. */
260 * The last slash is the first character in the string.
261 * We have to return "/". As a special case we have to
262 * return "//" if there are exactly two slashes at the
263 * beginning of the string. See XBD 4.10 Path Name
264 * Resolution for more information
266 if (last_slash
== path
+ 1)
269 last_slash
= path
+ 1;
273 last_slash
[0] = '\0';
276 * This assignment is ill-designed but the XPG specs require to
277 * return a string containing "." in any case no directory part
278 * is found and so a static and constant string is required.
286 static uint64_t get_min_memlimit(const char *cgroup
, bool swap
)
288 __do_free
char *copy
= NULL
;
289 uint64_t memlimit
= 0, retlimit
= 0;
291 copy
= strdup(cgroup
);
293 return log_error_errno(0, ENOMEM
, "Failed to allocate memory");
295 retlimit
= get_memlimit(copy
, swap
);
298 * If the cgroup doesn't start with / (probably won't happen), dirname()
299 * will terminate with "" instead of "/"
301 while (*copy
&& strcmp(copy
, "/") != 0) {
304 it
= gnu_dirname(it
);
305 memlimit
= get_memlimit(it
, swap
);
306 if (memlimit
> 0 && memlimit
< retlimit
)
/*
 * Report whether the string @line begins with the string @pref.
 *
 * Only the first strlen(@pref) bytes of @line are compared, so an empty
 * @pref matches every @line, and a @line shorter than @pref never matches
 * (the comparison stops at @line's terminating NUL byte).
 *
 * Both arguments must be valid NUL-terminated strings; NULL is not handled.
 *
 * Returns true on a prefix match, false otherwise.
 */
static inline bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
318 static int proc_swaps_read(char *buf
, size_t size
, off_t offset
,
319 struct fuse_file_info
*fi
)
321 __do_free
char *cgroup
= NULL
, *memusage_str
= NULL
, *memswusage_str
= NULL
;
322 struct fuse_context
*fc
= fuse_get_context();
323 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fuse_get_context()->private_data
;
324 bool wants_swap
= opts
&& !opts
->swap_off
&& liblxcfs_can_use_swap();
325 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
326 uint64_t memswlimit
= 0, memlimit
= 0, memusage
= 0, memswusage
= 0,
327 swtotal
= 0, swfree
= 0, swusage
= 0;
328 ssize_t total_len
= 0;
330 char *cache
= d
->buf
;
336 if (offset
> d
->size
)
342 left
= d
->size
- offset
;
343 total_len
= left
> size
? size
: left
;
344 memcpy(buf
, cache
+ offset
, total_len
);
349 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
350 if (initpid
<= 1 || is_shared_pidns(initpid
))
353 cgroup
= get_pid_cgroup(initpid
, "memory");
355 return read_file_fuse("/proc/swaps", buf
, size
, d
);
356 prune_init_slice(cgroup
);
358 memlimit
= get_min_memlimit(cgroup
, false);
360 ret
= cgroup_ops
->get_memory_current(cgroup_ops
, cgroup
, &memusage_str
);
364 if (safe_uint64(memusage_str
, &memusage
, 10) < 0)
365 lxcfs_error("Failed to convert memusage %s", memusage_str
);
368 memswlimit
= get_min_memlimit(cgroup
, true);
369 if (memswlimit
> 0) {
370 ret
= cgroup_ops
->get_memory_swap_current(cgroup_ops
, cgroup
, &memswusage_str
);
371 if (ret
>= 0 && safe_uint64(memswusage_str
, &memswusage
, 10) == 0) {
372 if (memlimit
> memswlimit
)
375 swtotal
= (memswlimit
- memlimit
) / 1024;
376 if (memusage
> memswusage
|| swtotal
== 0)
379 swusage
= (memswusage
- memusage
) / 1024;
380 if (swtotal
>= swusage
)
381 swfree
= swtotal
- swusage
;
386 total_len
= snprintf(d
->buf
, d
->size
, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
388 /* When no mem + swap limit is specified or swapaccount=0*/
390 __do_free
char *line
= NULL
;
391 __do_free
void *fopen_cache
= NULL
;
392 __do_fclose
FILE *f
= NULL
;
395 f
= fopen_cached("/proc/meminfo", "re", &fopen_cache
);
399 while (getline(&line
, &linelen
, f
) != -1) {
400 if (startswith(line
, "SwapTotal:"))
401 sscanf(line
, "SwapTotal: %8" PRIu64
" kB", &swtotal
);
402 else if (startswith(line
, "SwapFree:"))
403 sscanf(line
, "SwapFree: %8" PRIu64
" kB", &swfree
);
408 l
= snprintf(d
->buf
+ total_len
, d
->size
- total_len
,
409 "none%*svirtual\t\t%" PRIu64
"\t%" PRIu64
"\t0\n",
410 36, " ", swtotal
, swfree
);
414 if (total_len
< 0 || l
< 0)
415 return log_error(0, "Failed writing to cache");
418 d
->size
= (int)total_len
;
420 if (total_len
> size
)
422 memcpy(buf
, d
->buf
, total_len
);
427 static void get_blkio_io_value(char *str
, unsigned major
, unsigned minor
,
428 char *iotype
, uint64_t *v
)
435 snprintf(key
, 32, "%u:%u %s", major
, minor
, iotype
);
440 if (startswith(str
, key
)) {
441 sscanf(str
+ len
, "%lu", v
);
444 eol
= strchr(str
, '\n');
451 static int proc_diskstats_read(char *buf
, size_t size
, off_t offset
,
452 struct fuse_file_info
*fi
)
454 __do_free
char *cg
= NULL
, *io_serviced_str
= NULL
,
455 *io_merged_str
= NULL
, *io_service_bytes_str
= NULL
,
456 *io_wait_time_str
= NULL
, *io_service_time_str
= NULL
,
458 __do_free
void *fopen_cache
= NULL
;
459 __do_fclose
FILE *f
= NULL
;
460 struct fuse_context
*fc
= fuse_get_context();
461 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
462 uint64_t read
= 0, write
= 0;
463 uint64_t read_merged
= 0, write_merged
= 0;
464 uint64_t read_sectors
= 0, write_sectors
= 0;
465 uint64_t read_ticks
= 0, write_ticks
= 0;
466 uint64_t ios_pgr
= 0, tot_ticks
= 0, rq_ticks
= 0;
467 uint64_t rd_svctm
= 0, wr_svctm
= 0, rd_wait
= 0, wr_wait
= 0;
468 char *cache
= d
->buf
;
469 size_t cache_size
= d
->buflen
;
470 size_t linelen
= 0, total_len
= 0;
471 unsigned int major
= 0, minor
= 0;
479 if (offset
> d
->size
)
485 left
= d
->size
- offset
;
486 total_len
= left
> size
? size
: left
;
487 memcpy(buf
, cache
+ offset
, total_len
);
492 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
493 if (initpid
<= 1 || is_shared_pidns(initpid
))
496 cg
= get_pid_cgroup(initpid
, "blkio");
498 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
499 prune_init_slice(cg
);
501 ret
= cgroup_ops
->get_io_serviced(cgroup_ops
, cg
, &io_serviced_str
);
503 if (ret
== -EOPNOTSUPP
)
504 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
507 ret
= cgroup_ops
->get_io_merged(cgroup_ops
, cg
, &io_merged_str
);
509 if (ret
== -EOPNOTSUPP
)
510 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
513 ret
= cgroup_ops
->get_io_service_bytes(cgroup_ops
, cg
, &io_service_bytes_str
);
515 if (ret
== -EOPNOTSUPP
)
516 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
519 ret
= cgroup_ops
->get_io_wait_time(cgroup_ops
, cg
, &io_wait_time_str
);
521 if (ret
== -EOPNOTSUPP
)
522 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
525 ret
= cgroup_ops
->get_io_service_time(cgroup_ops
, cg
, &io_service_time_str
);
527 if (ret
== -EOPNOTSUPP
)
528 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
531 f
= fopen_cached("/proc/diskstats", "re", &fopen_cache
);
535 while (getline(&line
, &linelen
, f
) != -1) {
539 i
= sscanf(line
, "%u %u %71s", &major
, &minor
, dev_name
);
543 get_blkio_io_value(io_serviced_str
, major
, minor
, "Read", &read
);
544 get_blkio_io_value(io_serviced_str
, major
, minor
, "Write", &write
);
545 get_blkio_io_value(io_merged_str
, major
, minor
, "Read", &read_merged
);
546 get_blkio_io_value(io_merged_str
, major
, minor
, "Write", &write_merged
);
547 get_blkio_io_value(io_service_bytes_str
, major
, minor
, "Read", &read_sectors
);
548 read_sectors
= read_sectors
/512;
549 get_blkio_io_value(io_service_bytes_str
, major
, minor
, "Write", &write_sectors
);
550 write_sectors
= write_sectors
/512;
552 get_blkio_io_value(io_service_time_str
, major
, minor
, "Read", &rd_svctm
);
553 rd_svctm
= rd_svctm
/1000000;
554 get_blkio_io_value(io_wait_time_str
, major
, minor
, "Read", &rd_wait
);
555 rd_wait
= rd_wait
/1000000;
556 read_ticks
= rd_svctm
+ rd_wait
;
558 get_blkio_io_value(io_service_time_str
, major
, minor
, "Write", &wr_svctm
);
559 wr_svctm
= wr_svctm
/1000000;
560 get_blkio_io_value(io_wait_time_str
, major
, minor
, "Write", &wr_wait
);
561 wr_wait
= wr_wait
/1000000;
562 write_ticks
= wr_svctm
+ wr_wait
;
564 get_blkio_io_value(io_service_time_str
, major
, minor
, "Total", &tot_ticks
);
565 tot_ticks
= tot_ticks
/1000000;
567 memset(lbuf
, 0, 256);
568 if (read
|| write
|| read_merged
|| write_merged
|| read_sectors
|| write_sectors
|| read_ticks
|| write_ticks
)
569 snprintf(lbuf
, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
570 major
, minor
, dev_name
, read
, read_merged
, read_sectors
, read_ticks
,
571 write
, write_merged
, write_sectors
, write_ticks
, ios_pgr
, tot_ticks
, rq_ticks
);
575 l
= snprintf(cache
, cache_size
, "%s", lbuf
);
577 return log_error(0, "Failed to write cache");
579 return log_error(0, "Write to cache was truncated");
588 if (total_len
> size
)
590 memcpy(buf
, d
->buf
, total_len
);
596 static inline void iwashere(void)
598 mknod("/tmp/lxcfs-iwashere", S_IFREG
, 0644);
603 * This function retrieves the busy time of a group of tasks by looking at
604 * cpuacct.usage. Unfortunately, this only makes sense when the container has
605 * been given its own cpuacct cgroup. If not, this function will take the busy
606 * time of all other tasks that do not actually belong to the container into
607 * account as well. If someone has a clever solution for this please send a
610 static double get_reaper_busy(pid_t task
)
612 __do_free
char *cgroup
= NULL
, *usage_str
= NULL
;
616 initpid
= lookup_initpid_in_store(task
);
620 cgroup
= get_pid_cgroup(initpid
, "cpuacct");
623 prune_init_slice(cgroup
);
625 if (!cgroup_ops
->get(cgroup_ops
, "cpuacct", cgroup
, "cpuacct.usage", &usage_str
))
628 if (safe_uint64(usage_str
, &usage
, 10) < 0)
629 lxcfs_error("Failed to convert usage %s", usage_str
);
631 return ((double)usage
/ 1000000000);
634 static uint64_t get_reaper_start_time(pid_t pid
)
636 __do_free
void *fopen_cache
= NULL
;
637 __do_fclose
FILE *f
= NULL
;
640 char path
[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64
+
641 STRLITERALLEN("/stat") + 1];
644 qpid
= lookup_initpid_in_store(pid
);
646 return ret_errno(EINVAL
);
648 ret
= snprintf(path
, sizeof(path
), "/proc/%d/stat", qpid
);
649 if (ret
< 0 || (size_t)ret
>= sizeof(path
))
650 return ret_errno(EINVAL
);
652 f
= fopen_cached(path
, "re", &fopen_cache
);
654 return ret_errno(EINVAL
);
656 /* Note that the *scanf() argument suppression requires that length
657 * modifiers such as "l" are omitted. Otherwise some compilers will yell
658 * at us. It's like telling someone you're not married and then asking
659 * if you can bring your wife to the party.
661 ret
= fscanf(f
, "%*d " /* (1) pid %d */
662 "%*s " /* (2) comm %s */
663 "%*c " /* (3) state %c */
664 "%*d " /* (4) ppid %d */
665 "%*d " /* (5) pgrp %d */
666 "%*d " /* (6) session %d */
667 "%*d " /* (7) tty_nr %d */
668 "%*d " /* (8) tpgid %d */
669 "%*u " /* (9) flags %u */
670 "%*u " /* (10) minflt %lu */
671 "%*u " /* (11) cminflt %lu */
672 "%*u " /* (12) majflt %lu */
673 "%*u " /* (13) cmajflt %lu */
674 "%*u " /* (14) utime %lu */
675 "%*u " /* (15) stime %lu */
676 "%*d " /* (16) cutime %ld */
677 "%*d " /* (17) cstime %ld */
678 "%*d " /* (18) priority %ld */
679 "%*d " /* (19) nice %ld */
680 "%*d " /* (20) num_threads %ld */
681 "%*d " /* (21) itrealvalue %ld */
682 "%" PRIu64
, /* (22) starttime %llu */
685 return ret_errno(EINVAL
);
687 return ret_set_errno(starttime
, 0);
690 static double get_reaper_start_time_in_sec(pid_t pid
)
692 uint64_t clockticks
, ticks_per_sec
;
696 clockticks
= get_reaper_start_time(pid
);
698 return log_debug(0, "Failed to retrieve start time of pid %d", pid
);
700 ret
= sysconf(_SC_CLK_TCK
);
702 return log_debug(0, "Failed to determine number of clock ticks in a second");
704 ticks_per_sec
= (uint64_t)ret
;
705 res
= (double)clockticks
/ ticks_per_sec
;
709 static double get_reaper_age(pid_t pid
)
712 double procstart
, procage
;
715 * We need to subtract the time the process has started since system
716 * boot minus the time when the system has started to get the actual
719 procstart
= get_reaper_start_time_in_sec(pid
);
723 struct timespec spec
;
725 ret
= clock_gettime(CLOCK_BOOTTIME
, &spec
);
729 uptime_ms
= (spec
.tv_sec
* 1000) + (spec
.tv_nsec
* 1e-6);
730 procage
= (uptime_ms
- (procstart
* 1000)) / 1000;
737 * We read /proc/uptime and reuse its second field.
738 * For the first field, we use the mtime for the reaper for
739 * the calling pid as returned by get_reaper_age()
741 static int proc_uptime_read(char *buf
, size_t size
, off_t offset
,
742 struct fuse_file_info
*fi
)
744 struct fuse_context
*fc
= fuse_get_context();
745 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
746 char *cache
= d
->buf
;
747 ssize_t total_len
= 0, ret
= 0;
748 double busytime
, idletime
, reaperage
;
757 if (offset
> d
->size
)
763 left
= d
->size
- offset
;
764 total_len
= left
> size
? size
: left
;
765 memcpy(buf
, cache
+ offset
, total_len
);
770 reaperage
= get_reaper_age(fc
->pid
);
772 * To understand why this is done, please read the comment to the
773 * get_reaper_busy() function.
775 idletime
= reaperage
;
776 busytime
= get_reaper_busy(fc
->pid
);
777 if (reaperage
>= busytime
)
778 idletime
= reaperage
- busytime
;
780 ret
= snprintf(d
->buf
, d
->buflen
, "%.2lf %.2lf\n", reaperage
, idletime
);
781 if (ret
< 0 || ret
>= d
->buflen
)
782 return read_file_fuse("/proc/uptime", buf
, size
, d
);
787 if (total_len
> size
)
789 memcpy(buf
, d
->buf
, total_len
);
794 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
795 static int proc_stat_read(char *buf
, size_t size
, off_t offset
,
796 struct fuse_file_info
*fi
)
798 __do_free
char *cg
= NULL
, *cpuset
= NULL
, *line
= NULL
;
799 __do_free
void *fopen_cache
= NULL
;
800 __do_free
struct cpuacct_usage
*cg_cpu_usage
= NULL
;
801 __do_fclose
FILE *f
= NULL
;
802 struct fuse_context
*fc
= fuse_get_context();
803 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fc
->private_data
;
804 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
805 size_t linelen
= 0, total_len
= 0;
806 int curcpu
= -1; /* cpu numbering starts at 0 */
808 uint64_t user
= 0, nice
= 0, system
= 0, idle
= 0, iowait
= 0, irq
= 0,
809 softirq
= 0, steal
= 0, guest
= 0, guest_nice
= 0;
810 uint64_t user_sum
= 0, nice_sum
= 0, system_sum
= 0, idle_sum
= 0,
811 iowait_sum
= 0, irq_sum
= 0, softirq_sum
= 0, steal_sum
= 0,
812 guest_sum
= 0, guest_nice_sum
= 0;
813 char cpuall
[CPUALL_MAX_SIZE
];
814 /* reserve for cpu all */
815 char *cache
= d
->buf
+ CPUALL_MAX_SIZE
;
816 size_t cache_size
= d
->buflen
- CPUALL_MAX_SIZE
;
817 int cg_cpu_usage_size
= 0;
822 if (offset
> d
->size
)
828 left
= d
->size
- offset
;
829 total_len
= left
> size
? size
: left
;
830 memcpy(buf
, d
->buf
+ offset
, total_len
);
835 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
836 if (initpid
<= 1 || is_shared_pidns(initpid
))
840 * When a container runs in the host pid namespace, initpid == 1 and the cgroup is "/".
841 * In that case we should return the host OS's /proc contents.
842 * In some cases cpuacct.usage_all in "/" will be larger than /proc/stat.
845 return read_file_fuse("/proc/stat", buf
, size
, d
);
847 cg
= get_pid_cgroup(initpid
, "cpuset");
849 return read_file_fuse("/proc/stat", buf
, size
, d
);
850 prune_init_slice(cg
);
852 cpuset
= get_cpuset(cg
);
856 f
= fopen_cached("/proc/stat", "re", &fopen_cache
);
860 /* Skip first system cpu line. */
861 if (getline(&line
, &linelen
, f
) < 0)
862 return log_error(0, "proc_stat_read read first line failed");
865 * Read cpuacct.usage_all for all CPUs.
866 * If the cpuacct cgroup is present, it is used to calculate the container's
867 * CPU usage. If not, values from the host's /proc/stat are used.
869 if (read_cpuacct_usage_all(cg
, cpuset
, &cg_cpu_usage
, &cg_cpu_usage_size
) == 0) {
870 if (cgroup_ops
->can_use_cpuview(cgroup_ops
) && opts
&& opts
->use_cfs
) {
871 total_len
= cpuview_proc_stat(cg
, cpuset
, cg_cpu_usage
,
872 cg_cpu_usage_size
, f
,
877 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
880 while (getline(&line
, &linelen
, f
) != -1) {
882 char cpu_char
[10]; /* That's a lot of cores */
884 uint64_t all_used
, cg_used
, new_idle
;
887 if (strlen(line
) == 0)
889 if (sscanf(line
, "cpu%9[^ ]", cpu_char
) != 1) {
890 /* not a ^cpuN line containing a number N, just print it */
891 l
= snprintf(cache
, cache_size
, "%s", line
);
893 return log_error(0, "Failed to write cache");
895 return log_error(0, "Write to cache was truncated");
904 if (sscanf(cpu_char
, "%d", &physcpu
) != 1)
907 if (!cpu_in_cpuset(physcpu
, cpuset
))
912 ret
= sscanf(line
, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
923 if (ret
!= 10 || !cg_cpu_usage
) {
924 c
= strchr(line
, ' ');
928 l
= snprintf(cache
, cache_size
, "cpu%d%s", curcpu
, c
);
930 return log_error(0, "Failed to write cache");
932 return log_error(0, "Write to cache was truncated");
943 if (physcpu
>= cg_cpu_usage_size
)
946 all_used
= user
+ nice
+ system
+ iowait
+ irq
+ softirq
+ steal
+ guest
+ guest_nice
;
947 cg_used
= cg_cpu_usage
[physcpu
].user
+ cg_cpu_usage
[physcpu
].system
;
949 if (all_used
>= cg_used
) {
950 new_idle
= idle
+ (all_used
- cg_used
);
953 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64
" in /proc/stat, %" PRIu64
" in cpuacct.usage_all; unable to determine idle time",
954 curcpu
, cg
, all_used
, cg_used
);
958 l
= snprintf(cache
, cache_size
,
959 "cpu%d %" PRIu64
" 0 %" PRIu64
" %" PRIu64
" 0 0 0 0 0 0\n",
960 curcpu
, cg_cpu_usage
[physcpu
].user
,
961 cg_cpu_usage
[physcpu
].system
, new_idle
);
963 return log_error(0, "Failed to write cache");
965 return log_error(0, "Write to cache was truncated");
971 user_sum
+= cg_cpu_usage
[physcpu
].user
;
972 system_sum
+= cg_cpu_usage
[physcpu
].system
;
973 idle_sum
+= new_idle
;
977 system_sum
+= system
;
979 iowait_sum
+= iowait
;
981 softirq_sum
+= softirq
;
984 guest_nice_sum
+= guest_nice
;
990 int cpuall_len
= snprintf(cpuall
, CPUALL_MAX_SIZE
, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1001 if (cpuall_len
> 0 && cpuall_len
< CPUALL_MAX_SIZE
) {
1002 memcpy(cache
, cpuall
, cpuall_len
);
1003 cache
+= cpuall_len
;
1005 /* shouldn't happen */
1006 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len
);
1010 memmove(cache
, d
->buf
+ CPUALL_MAX_SIZE
, total_len
);
1011 total_len
+= cpuall_len
;
1015 d
->size
= total_len
;
1016 if (total_len
> size
)
1019 memcpy(buf
, d
->buf
, total_len
);
1023 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1024 static bool cgroup_parse_memory_stat(const char *cgroup
, struct memory_stat
*mstat
)
1026 __do_close
int fd
= -EBADF
;
1027 __do_fclose
FILE *f
= NULL
;
1028 __do_free
char *line
= NULL
;
1029 __do_free
void *fdopen_cache
= NULL
;
1034 fd
= cgroup_ops
->get_memory_stats_fd(cgroup_ops
, cgroup
);
1038 f
= fdopen_cached(fd
, "re", &fdopen_cache
);
1042 unified
= pure_unified_layout(cgroup_ops
);
1043 while ((linelen
= getline(&line
, &len
, f
)) != -1) {
1044 if (!unified
&& startswith(line
, "hierarchical_memory_limit")) {
1045 sscanf(line
, "hierarchical_memory_limit %" PRIu64
, &(mstat
->hierarchical_memory_limit
));
1046 } else if (!unified
&& startswith(line
, "hierarchical_memsw_limit")) {
1047 sscanf(line
, "hierarchical_memsw_limit %" PRIu64
, &(mstat
->hierarchical_memsw_limit
));
1048 } else if (startswith(line
, unified
? "file" :"total_cache")) {
1049 sscanf(line
, unified
? "file %" PRIu64
: "total_cache %" PRIu64
, &(mstat
->total_cache
));
1050 } else if (!unified
&& startswith(line
, "total_rss")) {
1051 sscanf(line
, "total_rss %" PRIu64
, &(mstat
->total_rss
));
1052 } else if (!unified
&& startswith(line
, "total_rss_huge")) {
1053 sscanf(line
, "total_rss_huge %" PRIu64
, &(mstat
->total_rss_huge
));
1054 } else if (startswith(line
, unified
? "shmem" : "total_shmem")) {
1055 sscanf(line
, unified
? "shmem %" PRIu64
: "total_shmem %" PRIu64
, &(mstat
->total_shmem
));
1056 } else if (startswith(line
, unified
? "file_mapped" : "total_mapped_file")) {
1057 sscanf(line
, unified
? "file_mapped %" PRIu64
: "total_mapped_file %" PRIu64
, &(mstat
->total_mapped_file
));
1058 } else if (!unified
&& startswith(line
, "total_dirty")) {
1059 sscanf(line
, "total_dirty %" PRIu64
, &(mstat
->total_dirty
));
1060 } else if (!unified
&& startswith(line
, "total_writeback")) {
1061 sscanf(line
, "total_writeback %" PRIu64
, &(mstat
->total_writeback
));
1062 } else if (!unified
&& startswith(line
, "total_swap")) {
1063 sscanf(line
, "total_swap %" PRIu64
, &(mstat
->total_swap
));
1064 } else if (!unified
&& startswith(line
, "total_pgpgin")) {
1065 sscanf(line
, "total_pgpgin %" PRIu64
, &(mstat
->total_pgpgin
));
1066 } else if (!unified
&& startswith(line
, "total_pgpgout")) {
1067 sscanf(line
, "total_pgpgout %" PRIu64
, &(mstat
->total_pgpgout
));
1068 } else if (startswith(line
, unified
? "pgfault" : "total_pgfault")) {
1069 sscanf(line
, unified
? "pgfault %" PRIu64
: "total_pgfault %" PRIu64
, &(mstat
->total_pgfault
));
1070 } else if (startswith(line
, unified
? "pgmajfault" : "total_pgmajfault")) {
1071 sscanf(line
, unified
? "pgmajfault %" PRIu64
: "total_pgmajfault %" PRIu64
, &(mstat
->total_pgmajfault
));
1072 } else if (startswith(line
, unified
? "inactive_anon" : "total_inactive_anon")) {
1073 sscanf(line
, unified
? "inactive_anon %" PRIu64
: "total_inactive_anon %" PRIu64
, &(mstat
->total_inactive_anon
));
1074 } else if (startswith(line
, unified
? "active_anon" : "total_active_anon")) {
1075 sscanf(line
, unified
? "active_anon %" PRIu64
: "total_active_anon %" PRIu64
, &(mstat
->total_active_anon
));
1076 } else if (startswith(line
, unified
? "inactive_file" : "total_inactive_file")) {
1077 sscanf(line
, unified
? "inactive_file %" PRIu64
: "total_inactive_file %" PRIu64
, &(mstat
->total_inactive_file
));
1078 } else if (startswith(line
, unified
? "active_file" : "total_active_file")) {
1079 sscanf(line
, unified
? "active_file %" PRIu64
: "total_active_file %" PRIu64
, &(mstat
->total_active_file
));
1080 } else if (startswith(line
, unified
? "unevictable" : "total_unevictable")) {
1081 sscanf(line
, unified
? "unevictable %" PRIu64
: "total_unevictable %" PRIu64
, &(mstat
->total_unevictable
));
1088 static int proc_meminfo_read(char *buf
, size_t size
, off_t offset
,
1089 struct fuse_file_info
*fi
)
1091 __do_free
char *cgroup
= NULL
, *line
= NULL
, *memusage_str
= NULL
,
1092 *memswusage_str
= NULL
;
1093 __do_free
void *fopen_cache
= NULL
;
1094 __do_fclose
FILE *f
= NULL
;
1095 struct fuse_context
*fc
= fuse_get_context();
1096 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fuse_get_context()->private_data
;
1097 bool wants_swap
= opts
&& !opts
->swap_off
&& liblxcfs_can_use_swap(), host_swap
= false;
1098 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
1099 uint64_t memlimit
= 0, memusage
= 0, memswlimit
= 0, memswusage
= 0,
1100 hosttotal
= 0, swfree
= 0, swusage
= 0, swtotal
= 0;
1101 struct memory_stat mstat
= {};
1102 size_t linelen
= 0, total_len
= 0;
1103 char *cache
= d
->buf
;
1104 size_t cache_size
= d
->buflen
;
1110 if (offset
> d
->size
)
1116 left
= d
->size
- offset
;
1117 total_len
= left
> size
? size
: left
;
1118 memcpy(buf
, cache
+ offset
, total_len
);
1123 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
1124 if (initpid
<= 1 || is_shared_pidns(initpid
))
1127 cgroup
= get_pid_cgroup(initpid
, "memory");
1129 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1131 prune_init_slice(cgroup
);
1134 ret
= cgroup_ops
->get_memory_current(cgroup_ops
, cgroup
, &memusage_str
);
1136 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1138 if (safe_uint64(memusage_str
, &memusage
, 10) < 0)
1139 lxcfs_error("Failed to convert memusage %s", memusage_str
);
1141 if (!cgroup_parse_memory_stat(cgroup
, &mstat
))
1142 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1144 memlimit
= get_min_memlimit(cgroup
, false);
1147 * Following values are allowed to fail, because swapaccount might be
1148 * turned off for current kernel.
1151 memswlimit
= get_min_memlimit(cgroup
, true);
1152 if (memswlimit
> 0) {
1153 ret
= cgroup_ops
->get_memory_swap_current(cgroup_ops
, cgroup
, &memswusage_str
);
1154 if (ret
>= 0 && safe_uint64(memswusage_str
, &memswusage
, 10) == 0) {
1155 if (memlimit
> memswlimit
)
1158 swtotal
= (memswlimit
- memlimit
) / 1024;
1159 if (memusage
> memswusage
|| swtotal
== 0)
1162 swusage
= (memswusage
- memusage
) / 1024;
1167 f
= fopen_cached("/proc/meminfo", "re", &fopen_cache
);
1169 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1173 while (getline(&line
, &linelen
, f
) != -1) {
1175 char *printme
, lbuf
[100];
1177 memset(lbuf
, 0, 100);
1178 if (startswith(line
, "MemTotal:")) {
1179 sscanf(line
+sizeof("MemTotal:")-1, "%" PRIu64
, &hosttotal
);
1181 memlimit
= hosttotal
;
1183 if (hosttotal
< memlimit
)
1184 memlimit
= hosttotal
;
1185 snprintf(lbuf
, 100, "MemTotal: %8" PRIu64
" kB\n", memlimit
);
1187 } else if (startswith(line
, "MemFree:")) {
1188 snprintf(lbuf
, 100, "MemFree: %8" PRIu64
" kB\n", memlimit
- memusage
);
1190 } else if (startswith(line
, "MemAvailable:")) {
1191 snprintf(lbuf
, 100, "MemAvailable: %8" PRIu64
" kB\n", memlimit
- memusage
+ mstat
.total_cache
/ 1024);
1193 } else if (startswith(line
, "SwapTotal:")) {
1195 uint64_t hostswtotal
= 0;
1197 sscanf(line
+ STRLITERALLEN("SwapTotal:"), "%" PRIu64
, &hostswtotal
);
1199 if (hostswtotal
< swtotal
) {
1200 swtotal
= hostswtotal
;
1205 snprintf(lbuf
, 100, "SwapTotal: %8" PRIu64
" kB\n", swtotal
);
1207 } else if (startswith(line
, "SwapFree:")) {
1209 uint64_t hostswfree
= 0;
1212 sscanf(line
+ STRLITERALLEN("SwapFree:"), "%" PRIu64
, &hostswfree
);
1213 swfree
= hostswfree
;
1214 } else if (swtotal
>= swusage
) {
1215 swfree
= swtotal
- swusage
;
1219 snprintf(lbuf
, 100, "SwapFree: %8" PRIu64
" kB\n", swfree
);
1221 } else if (startswith(line
, "Slab:")) {
1222 snprintf(lbuf
, 100, "Slab: %8" PRIu64
" kB\n", (uint64_t)0);
1224 } else if (startswith(line
, "Buffers:")) {
1225 snprintf(lbuf
, 100, "Buffers: %8" PRIu64
" kB\n", (uint64_t)0);
1227 } else if (startswith(line
, "Cached:")) {
1228 snprintf(lbuf
, 100, "Cached: %8" PRIu64
" kB\n",
1229 mstat
.total_cache
/ 1024);
1231 } else if (startswith(line
, "SwapCached:")) {
1232 snprintf(lbuf
, 100, "SwapCached: %8" PRIu64
" kB\n", (uint64_t)0);
1234 } else if (startswith(line
, "Active:")) {
1235 snprintf(lbuf
, 100, "Active: %8" PRIu64
" kB\n",
1236 (mstat
.total_active_anon
+
1237 mstat
.total_active_file
) /
1240 } else if (startswith(line
, "Inactive:")) {
1241 snprintf(lbuf
, 100, "Inactive: %8" PRIu64
" kB\n",
1242 (mstat
.total_inactive_anon
+
1243 mstat
.total_inactive_file
) /
1246 } else if (startswith(line
, "Active(anon):")) {
1247 snprintf(lbuf
, 100, "Active(anon): %8" PRIu64
" kB\n",
1248 mstat
.total_active_anon
/ 1024);
1250 } else if (startswith(line
, "Inactive(anon):")) {
1251 snprintf(lbuf
, 100, "Inactive(anon): %8" PRIu64
" kB\n",
1252 mstat
.total_inactive_anon
/ 1024);
1254 } else if (startswith(line
, "Active(file):")) {
1255 snprintf(lbuf
, 100, "Active(file): %8" PRIu64
" kB\n",
1256 mstat
.total_active_file
/ 1024);
1258 } else if (startswith(line
, "Inactive(file):")) {
1259 snprintf(lbuf
, 100, "Inactive(file): %8" PRIu64
" kB\n",
1260 mstat
.total_inactive_file
/ 1024);
1262 } else if (startswith(line
, "Unevictable:")) {
1263 snprintf(lbuf
, 100, "Unevictable: %8" PRIu64
" kB\n",
1264 mstat
.total_unevictable
/ 1024);
1266 } else if (startswith(line
, "Dirty:")) {
1267 snprintf(lbuf
, 100, "Dirty: %8" PRIu64
" kB\n",
1268 mstat
.total_dirty
/ 1024);
1270 } else if (startswith(line
, "Writeback:")) {
1271 snprintf(lbuf
, 100, "Writeback: %8" PRIu64
" kB\n",
1272 mstat
.total_writeback
/ 1024);
1274 } else if (startswith(line
, "AnonPages:")) {
1275 snprintf(lbuf
, 100, "AnonPages: %8" PRIu64
" kB\n",
1276 (mstat
.total_active_anon
+
1277 mstat
.total_inactive_anon
- mstat
.total_shmem
) /
1280 } else if (startswith(line
, "Mapped:")) {
1281 snprintf(lbuf
, 100, "Mapped: %8" PRIu64
" kB\n",
1282 mstat
.total_mapped_file
/ 1024);
1284 } else if (startswith(line
, "SReclaimable:")) {
1285 snprintf(lbuf
, 100, "SReclaimable: %8" PRIu64
" kB\n", (uint64_t)0);
1287 } else if (startswith(line
, "SUnreclaim:")) {
1288 snprintf(lbuf
, 100, "SUnreclaim: %8" PRIu64
" kB\n", (uint64_t)0);
1290 } else if (startswith(line
, "Shmem:")) {
1291 snprintf(lbuf
, 100, "Shmem: %8" PRIu64
" kB\n",
1292 mstat
.total_shmem
/ 1024);
1294 } else if (startswith(line
, "ShmemHugePages:")) {
1295 snprintf(lbuf
, 100, "ShmemHugePages: %8" PRIu64
" kB\n", (uint64_t)0);
1297 } else if (startswith(line
, "ShmemPmdMapped:")) {
1298 snprintf(lbuf
, 100, "ShmemPmdMapped: %8" PRIu64
" kB\n", (uint64_t)0);
1300 } else if (startswith(line
, "AnonHugePages:")) {
1301 snprintf(lbuf
, 100, "AnonHugePages: %8" PRIu64
" kB\n",
1302 mstat
.total_rss_huge
/ 1024);
1308 l
= snprintf(cache
, cache_size
, "%s", printme
);
1310 return log_error(0, "Failed to write cache");
1311 if (l
>= cache_size
)
1312 return log_error(0, "Write to cache was truncated");
1320 d
->size
= total_len
;
1321 if (total_len
> size
)
1323 memcpy(buf
, d
->buf
, total_len
);
1328 __lxcfs_fuse_ops
int proc_read(const char *path
, char *buf
, size_t size
,
1329 off_t offset
, struct fuse_file_info
*fi
)
1331 struct file_info
*f
= INTTYPE_TO_PTR(fi
->fh
);
1334 case LXC_TYPE_PROC_MEMINFO
:
1335 if (liblxcfs_functional())
1336 return proc_meminfo_read(buf
, size
, offset
, fi
);
1338 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH
,
1339 buf
, size
, offset
, f
);
1340 case LXC_TYPE_PROC_CPUINFO
:
1341 if (liblxcfs_functional())
1342 return proc_cpuinfo_read(buf
, size
, offset
, fi
);
1344 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH
,
1345 buf
, size
, offset
, f
);
1346 case LXC_TYPE_PROC_UPTIME
:
1347 if (liblxcfs_functional())
1348 return proc_uptime_read(buf
, size
, offset
, fi
);
1350 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH
,
1351 buf
, size
, offset
, f
);
1352 case LXC_TYPE_PROC_STAT
:
1353 if (liblxcfs_functional())
1354 return proc_stat_read(buf
, size
, offset
, fi
);
1356 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH
, buf
,
1358 case LXC_TYPE_PROC_DISKSTATS
:
1359 if (liblxcfs_functional())
1360 return proc_diskstats_read(buf
, size
, offset
, fi
);
1362 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH
,
1363 buf
, size
, offset
, f
);
1364 case LXC_TYPE_PROC_SWAPS
:
1365 if (liblxcfs_functional())
1366 return proc_swaps_read(buf
, size
, offset
, fi
);
1368 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH
, buf
,
1370 case LXC_TYPE_PROC_LOADAVG
:
1371 if (liblxcfs_functional())
1372 return proc_loadavg_read(buf
, size
, offset
, fi
);
1374 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH
,
1375 buf
, size
, offset
, f
);