1 /* SPDX-License-Identifier: LGPL-2.1+ */
7 #ifndef FUSE_USE_VERSION
8 #define FUSE_USE_VERSION 26
11 #define _FILE_OFFSET_BITS 64
13 #define __STDC_FORMAT_MACROS
31 #include <linux/magic.h>
32 #include <linux/sched.h>
33 #include <sys/epoll.h>
35 #include <sys/mount.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <sys/syscall.h>
39 #include <sys/sysinfo.h>
44 #include "cgroup_fuse.h"
45 #include "cgroups/cgroup.h"
46 #include "cgroups/cgroup_utils.h"
47 #include "cpuset_parse.h"
48 #include "memory_utils.h"
49 #include "proc_loadavg.h"
50 #include "proc_cpuview.h"
54 uint64_t hierarchical_memory_limit
;
55 uint64_t hierarchical_memsw_limit
;
58 uint64_t total_rss_huge
;
60 uint64_t total_mapped_file
;
62 uint64_t total_writeback
;
64 uint64_t total_pgpgin
;
65 uint64_t total_pgpgout
;
66 uint64_t total_pgfault
;
67 uint64_t total_pgmajfault
;
68 uint64_t total_inactive_anon
;
69 uint64_t total_active_anon
;
70 uint64_t total_inactive_file
;
71 uint64_t total_active_file
;
72 uint64_t total_unevictable
;
75 __lxcfs_fuse_ops
int proc_getattr(const char *path
, struct stat
*sb
)
79 memset(sb
, 0, sizeof(struct stat
));
80 if (clock_gettime(CLOCK_REALTIME
, &now
) < 0)
83 sb
->st_uid
= sb
->st_gid
= 0;
84 sb
->st_atim
= sb
->st_mtim
= sb
->st_ctim
= now
;
85 if (strcmp(path
, "/proc") == 0) {
86 sb
->st_mode
= S_IFDIR
| 00555;
91 if (strcmp(path
, "/proc/meminfo") == 0 ||
92 strcmp(path
, "/proc/cpuinfo") == 0 ||
93 strcmp(path
, "/proc/uptime") == 0 ||
94 strcmp(path
, "/proc/stat") == 0 ||
95 strcmp(path
, "/proc/diskstats") == 0 ||
96 strcmp(path
, "/proc/swaps") == 0 ||
97 strcmp(path
, "/proc/loadavg") == 0) {
99 sb
->st_mode
= S_IFREG
| 00444;
107 __lxcfs_fuse_ops
int proc_readdir(const char *path
, void *buf
,
108 fuse_fill_dir_t filler
, off_t offset
,
109 struct fuse_file_info
*fi
)
111 if (filler(buf
, ".", NULL
, 0) != 0 ||
112 filler(buf
, "..", NULL
, 0) != 0 ||
113 filler(buf
, "cpuinfo", NULL
, 0) != 0 ||
114 filler(buf
, "meminfo", NULL
, 0) != 0 ||
115 filler(buf
, "stat", NULL
, 0) != 0 ||
116 filler(buf
, "uptime", NULL
, 0) != 0 ||
117 filler(buf
, "diskstats", NULL
, 0) != 0 ||
118 filler(buf
, "swaps", NULL
, 0) != 0 ||
119 filler(buf
, "loadavg", NULL
, 0) != 0)
125 static off_t
get_procfile_size(const char *path
)
127 __do_fclose
FILE *f
= NULL
;
128 __do_free
char *line
= NULL
;
130 ssize_t sz
, answer
= 0;
132 f
= fopen(path
, "re");
136 while ((sz
= getline(&line
, &len
, f
)) != -1)
142 __lxcfs_fuse_ops
int proc_open(const char *path
, struct fuse_file_info
*fi
)
144 __do_free
struct file_info
*info
= NULL
;
147 if (strcmp(path
, "/proc/meminfo") == 0)
148 type
= LXC_TYPE_PROC_MEMINFO
;
149 else if (strcmp(path
, "/proc/cpuinfo") == 0)
150 type
= LXC_TYPE_PROC_CPUINFO
;
151 else if (strcmp(path
, "/proc/uptime") == 0)
152 type
= LXC_TYPE_PROC_UPTIME
;
153 else if (strcmp(path
, "/proc/stat") == 0)
154 type
= LXC_TYPE_PROC_STAT
;
155 else if (strcmp(path
, "/proc/diskstats") == 0)
156 type
= LXC_TYPE_PROC_DISKSTATS
;
157 else if (strcmp(path
, "/proc/swaps") == 0)
158 type
= LXC_TYPE_PROC_SWAPS
;
159 else if (strcmp(path
, "/proc/loadavg") == 0)
160 type
= LXC_TYPE_PROC_LOADAVG
;
164 info
= malloc(sizeof(*info
));
168 memset(info
, 0, sizeof(*info
));
171 info
->buflen
= get_procfile_size(path
) + BUF_RESERVE_SIZE
;
173 info
->buf
= malloc(info
->buflen
);
177 memset(info
->buf
, 0, info
->buflen
);
178 /* set actual size to buffer size */
179 info
->size
= info
->buflen
;
181 fi
->fh
= PTR_TO_UINT64(move_ptr(info
));
185 __lxcfs_fuse_ops
int proc_access(const char *path
, int mask
)
187 if (strcmp(path
, "/proc") == 0 && access(path
, R_OK
) == 0)
190 /* these are all read-only */
191 if ((mask
& ~R_OK
) != 0)
197 __lxcfs_fuse_ops
int proc_release(const char *path
, struct fuse_file_info
*fi
)
199 do_release_file_info(fi
);
203 static uint64_t get_memlimit(const char *cgroup
, bool swap
)
205 __do_free
char *memlimit_str
= NULL
;
206 uint64_t memlimit
= 0;
210 ret
= cgroup_ops
->get_memory_swap_max(cgroup_ops
, cgroup
, &memlimit_str
);
212 ret
= cgroup_ops
->get_memory_max(cgroup_ops
, cgroup
, &memlimit_str
);
213 if (ret
> 0 && safe_uint64(memlimit_str
, &memlimit
, 10) < 0)
214 lxcfs_error("Failed to convert memlimit %s", memlimit_str
);
219 static uint64_t get_min_memlimit(const char *cgroup
, bool swap
)
221 __do_free
char *copy
= NULL
;
222 uint64_t memlimit
= 0, retlimit
= 0;
224 copy
= strdup(cgroup
);
226 return log_error_errno(0, ENOMEM
, "Failed to allocate memory");
228 retlimit
= get_memlimit(copy
, swap
);
230 while (strcmp(copy
, "/") != 0) {
234 memlimit
= get_memlimit(it
, swap
);
235 if (memlimit
> 0 && memlimit
< retlimit
)
/*
 * startswith - check whether @line begins with the prefix @pref.
 *
 * Compares the first strlen(@pref) bytes of @line against @pref using
 * strncmp(). An empty @pref therefore matches every @line, and a @line
 * that is shorter than @pref never matches because strncmp() stops at
 * the terminating NUL byte of @line.
 *
 * Both arguments must be non-NULL, NUL-terminated strings.
 *
 * Returns true if @line starts with @pref, false otherwise.
 */
static inline bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
247 static int proc_swaps_read(char *buf
, size_t size
, off_t offset
,
248 struct fuse_file_info
*fi
)
250 __do_free
char *cg
= NULL
, *memswlimit_str
= NULL
, *memusage_str
= NULL
,
251 *memswusage_str
= NULL
;
252 struct fuse_context
*fc
= fuse_get_context();
253 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
254 uint64_t memswlimit
= 0, memlimit
= 0, memusage
= 0, memswusage
= 0,
255 swap_total
= 0, swap_free
= 0;
256 ssize_t total_len
= 0;
258 char *cache
= d
->buf
;
264 if (offset
> d
->size
)
270 left
= d
->size
- offset
;
271 total_len
= left
> size
? size
: left
;
272 memcpy(buf
, cache
+ offset
, total_len
);
277 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
278 if (initpid
<= 1 || is_shared_pidns(initpid
))
281 cg
= get_pid_cgroup(initpid
, "memory");
283 return read_file_fuse("/proc/swaps", buf
, size
, d
);
284 prune_init_slice(cg
);
286 memlimit
= get_min_memlimit(cg
, false);
288 ret
= cgroup_ops
->get_memory_current(cgroup_ops
, cg
, &memusage_str
);
292 if (safe_uint64(memusage_str
, &memusage
, 10) < 0)
293 lxcfs_error("Failed to convert memusage %s", memusage_str
);
295 ret
= cgroup_ops
->get_memory_swap_max(cgroup_ops
, cg
, &memswlimit_str
);
297 ret
= cgroup_ops
->get_memory_swap_current(cgroup_ops
, cg
, &memswusage_str
);
299 memswlimit
= get_min_memlimit(cg
, true);
301 if (safe_uint64(memswusage_str
, &memswusage
, 10) < 0)
302 lxcfs_error("Failed to convert memswusage %s", memswusage_str
);
304 swap_total
= (memswlimit
- memlimit
) / 1024;
305 swap_free
= (memswusage
- memusage
) / 1024;
308 total_len
= snprintf(d
->buf
, d
->size
, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
310 /* When no mem + swap limit is specified or swapaccount=0*/
312 __do_free
char *line
= NULL
;
313 __do_free
void *fopen_cache
= NULL
;
314 __do_fclose
FILE *f
= NULL
;
317 f
= fopen_cached("/proc/meminfo", "re", &fopen_cache
);
321 while (getline(&line
, &linelen
, f
) != -1) {
322 if (startswith(line
, "SwapTotal:"))
323 sscanf(line
, "SwapTotal: %8" PRIu64
" kB", &swap_total
);
324 else if (startswith(line
, "SwapFree:"))
325 sscanf(line
, "SwapFree: %8" PRIu64
" kB", &swap_free
);
329 if (swap_total
> 0) {
330 l
= snprintf(d
->buf
+ total_len
, d
->size
- total_len
,
331 "none%*svirtual\t\t%" PRIu64
"\t%" PRIu64
"\t0\n",
332 36, " ", swap_total
, swap_free
);
336 if (total_len
< 0 || l
< 0)
337 return log_error(0, "Failed writing to cache");
340 d
->size
= (int)total_len
;
342 if (total_len
> size
)
344 memcpy(buf
, d
->buf
, total_len
);
349 static void get_blkio_io_value(char *str
, unsigned major
, unsigned minor
,
350 char *iotype
, uint64_t *v
)
357 snprintf(key
, 32, "%u:%u %s", major
, minor
, iotype
);
362 if (startswith(str
, key
)) {
363 sscanf(str
+ len
, "%lu", v
);
366 eol
= strchr(str
, '\n');
373 static int proc_diskstats_read(char *buf
, size_t size
, off_t offset
,
374 struct fuse_file_info
*fi
)
376 __do_free
char *cg
= NULL
, *io_serviced_str
= NULL
,
377 *io_merged_str
= NULL
, *io_service_bytes_str
= NULL
,
378 *io_wait_time_str
= NULL
, *io_service_time_str
= NULL
,
380 __do_free
void *fopen_cache
= NULL
;
381 __do_fclose
FILE *f
= NULL
;
382 struct fuse_context
*fc
= fuse_get_context();
383 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
384 uint64_t read
= 0, write
= 0;
385 uint64_t read_merged
= 0, write_merged
= 0;
386 uint64_t read_sectors
= 0, write_sectors
= 0;
387 uint64_t read_ticks
= 0, write_ticks
= 0;
388 uint64_t ios_pgr
= 0, tot_ticks
= 0, rq_ticks
= 0;
389 uint64_t rd_svctm
= 0, wr_svctm
= 0, rd_wait
= 0, wr_wait
= 0;
390 char *cache
= d
->buf
;
391 size_t cache_size
= d
->buflen
;
392 size_t linelen
= 0, total_len
= 0;
393 unsigned int major
= 0, minor
= 0;
401 if (offset
> d
->size
)
407 left
= d
->size
- offset
;
408 total_len
= left
> size
? size
: left
;
409 memcpy(buf
, cache
+ offset
, total_len
);
414 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
415 if (initpid
<= 1 || is_shared_pidns(initpid
))
418 cg
= get_pid_cgroup(initpid
, "blkio");
420 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
421 prune_init_slice(cg
);
423 ret
= cgroup_ops
->get_io_serviced(cgroup_ops
, cg
, &io_serviced_str
);
425 if (ret
== -EOPNOTSUPP
)
426 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
429 ret
= cgroup_ops
->get_io_merged(cgroup_ops
, cg
, &io_merged_str
);
431 if (ret
== -EOPNOTSUPP
)
432 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
435 ret
= cgroup_ops
->get_io_service_bytes(cgroup_ops
, cg
, &io_service_bytes_str
);
437 if (ret
== -EOPNOTSUPP
)
438 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
441 ret
= cgroup_ops
->get_io_wait_time(cgroup_ops
, cg
, &io_wait_time_str
);
443 if (ret
== -EOPNOTSUPP
)
444 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
447 ret
= cgroup_ops
->get_io_service_time(cgroup_ops
, cg
, &io_service_time_str
);
449 if (ret
== -EOPNOTSUPP
)
450 return read_file_fuse("/proc/diskstats", buf
, size
, d
);
453 f
= fopen_cached("/proc/diskstats", "re", &fopen_cache
);
457 while (getline(&line
, &linelen
, f
) != -1) {
461 i
= sscanf(line
, "%u %u %71s", &major
, &minor
, dev_name
);
465 get_blkio_io_value(io_serviced_str
, major
, minor
, "Read", &read
);
466 get_blkio_io_value(io_serviced_str
, major
, minor
, "Write", &write
);
467 get_blkio_io_value(io_merged_str
, major
, minor
, "Read", &read_merged
);
468 get_blkio_io_value(io_merged_str
, major
, minor
, "Write", &write_merged
);
469 get_blkio_io_value(io_service_bytes_str
, major
, minor
, "Read", &read_sectors
);
470 read_sectors
= read_sectors
/512;
471 get_blkio_io_value(io_service_bytes_str
, major
, minor
, "Write", &write_sectors
);
472 write_sectors
= write_sectors
/512;
474 get_blkio_io_value(io_service_time_str
, major
, minor
, "Read", &rd_svctm
);
475 rd_svctm
= rd_svctm
/1000000;
476 get_blkio_io_value(io_wait_time_str
, major
, minor
, "Read", &rd_wait
);
477 rd_wait
= rd_wait
/1000000;
478 read_ticks
= rd_svctm
+ rd_wait
;
480 get_blkio_io_value(io_service_time_str
, major
, minor
, "Write", &wr_svctm
);
481 wr_svctm
= wr_svctm
/1000000;
482 get_blkio_io_value(io_wait_time_str
, major
, minor
, "Write", &wr_wait
);
483 wr_wait
= wr_wait
/1000000;
484 write_ticks
= wr_svctm
+ wr_wait
;
486 get_blkio_io_value(io_service_time_str
, major
, minor
, "Total", &tot_ticks
);
487 tot_ticks
= tot_ticks
/1000000;
489 memset(lbuf
, 0, 256);
490 if (read
|| write
|| read_merged
|| write_merged
|| read_sectors
|| write_sectors
|| read_ticks
|| write_ticks
)
491 snprintf(lbuf
, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
492 major
, minor
, dev_name
, read
, read_merged
, read_sectors
, read_ticks
,
493 write
, write_merged
, write_sectors
, write_ticks
, ios_pgr
, tot_ticks
, rq_ticks
);
497 l
= snprintf(cache
, cache_size
, "%s", lbuf
);
499 return log_error(0, "Failed to write cache");
501 return log_error(0, "Write to cache was truncated");
510 if (total_len
> size
)
512 memcpy(buf
, d
->buf
, total_len
);
518 static inline void iwashere(void)
520 mknod("/tmp/lxcfs-iwashere", S_IFREG
, 0644);
524 /* This function retrieves the busy time of a group of tasks by looking at
525 * cpuacct.usage. Unfortunately, this only makes sense when the container has
526 * been given its own cpuacct cgroup. If not, this function will take the busy
527 * time of all other tasks that do not actually belong to the container into
528 * account as well. If someone has a clever solution for this please send a
531 static double get_reaper_busy(pid_t task
)
533 __do_free
char *cgroup
= NULL
, *usage_str
= NULL
;
537 initpid
= lookup_initpid_in_store(task
);
541 cgroup
= get_pid_cgroup(initpid
, "cpuacct");
544 prune_init_slice(cgroup
);
545 if (!cgroup_ops
->get(cgroup_ops
, "cpuacct", cgroup
, "cpuacct.usage", &usage_str
))
548 if (safe_uint64(usage_str
, &usage
, 10) < 0)
549 lxcfs_error("Failed to convert usage %s", usage_str
);
551 return ((double)usage
/ 1000000000);
554 static uint64_t get_reaper_start_time(pid_t pid
)
556 __do_free
void *fopen_cache
= NULL
;
557 __do_fclose
FILE *f
= NULL
;
560 /* strlen("/proc/") = 6
564 * strlen("/stat") = 5
568 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
569 char path
[__PROC_PID_STAT_LEN
];
572 qpid
= lookup_initpid_in_store(pid
);
574 /* Caller can check for EINVAL on 0. */
579 ret
= snprintf(path
, __PROC_PID_STAT_LEN
, "/proc/%d/stat", qpid
);
580 if (ret
< 0 || ret
>= __PROC_PID_STAT_LEN
) {
581 /* Caller can check for EINVAL on 0. */
586 f
= fopen_cached(path
, "re", &fopen_cache
);
588 /* Caller can check for EINVAL on 0. */
593 /* Note that the *scanf() argument suppression requires that length
594 * modifiers such as "l" are omitted. Otherwise some compilers will yell
595 * at us. It's like telling someone you're not married and then asking
596 * if you can bring your wife to the party.
598 ret
= fscanf(f
, "%*d " /* (1) pid %d */
599 "%*s " /* (2) comm %s */
600 "%*c " /* (3) state %c */
601 "%*d " /* (4) ppid %d */
602 "%*d " /* (5) pgrp %d */
603 "%*d " /* (6) session %d */
604 "%*d " /* (7) tty_nr %d */
605 "%*d " /* (8) tpgid %d */
606 "%*u " /* (9) flags %u */
607 "%*u " /* (10) minflt %lu */
608 "%*u " /* (11) cminflt %lu */
609 "%*u " /* (12) majflt %lu */
610 "%*u " /* (13) cmajflt %lu */
611 "%*u " /* (14) utime %lu */
612 "%*u " /* (15) stime %lu */
613 "%*d " /* (16) cutime %ld */
614 "%*d " /* (17) cstime %ld */
615 "%*d " /* (18) priority %ld */
616 "%*d " /* (19) nice %ld */
617 "%*d " /* (20) num_threads %ld */
618 "%*d " /* (21) itrealvalue %ld */
619 "%" PRIu64
, /* (22) starttime %llu */
622 return ret_set_errno(0, EINVAL
);
624 return ret_set_errno(starttime
, 0);
627 static double get_reaper_start_time_in_sec(pid_t pid
)
629 uint64_t clockticks
, ticks_per_sec
;
633 clockticks
= get_reaper_start_time(pid
);
634 if (clockticks
== 0 && errno
== EINVAL
)
635 return log_debug(0, "Failed to retrieve start time of pid %d", pid
);
637 ret
= sysconf(_SC_CLK_TCK
);
638 if (ret
< 0 && errno
== EINVAL
)
639 return log_debug(0, "Failed to determine number of clock ticks in a second");
641 ticks_per_sec
= (uint64_t)ret
;
642 res
= (double)clockticks
/ ticks_per_sec
;
646 static double get_reaper_age(pid_t pid
)
649 double procstart
, procage
;
651 /* We need to subtract the time the process has started since system
652 * boot minus the time when the system has started to get the actual
655 procstart
= get_reaper_start_time_in_sec(pid
);
659 struct timespec spec
;
661 ret
= clock_gettime(CLOCK_BOOTTIME
, &spec
);
665 /* We could make this more precise here by using the tv_nsec
666 * field in the timespec struct and convert it to milliseconds
667 * and then create a double for the seconds and milliseconds but
668 * that seems more work than it is worth.
670 uptime_ms
= (spec
.tv_sec
* 1000) + (spec
.tv_nsec
* 1e-6);
671 procage
= (uptime_ms
- (procstart
* 1000)) / 1000;
678 * We read /proc/uptime and reuse its second field.
679 * For the first field, we use the mtime for the reaper for
680 * the calling pid as returned by getreaperage
682 static int proc_uptime_read(char *buf
, size_t size
, off_t offset
,
683 struct fuse_file_info
*fi
)
685 struct fuse_context
*fc
= fuse_get_context();
686 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
687 double busytime
= get_reaper_busy(fc
->pid
);
688 char *cache
= d
->buf
;
689 ssize_t total_len
= 0;
690 double idletime
, reaperage
;
702 if (offset
> d
->size
)
705 left
= d
->size
- offset
;
706 total_len
= left
> size
? size
: left
;
707 memcpy(buf
, cache
+ offset
, total_len
);
712 reaperage
= get_reaper_age(fc
->pid
);
714 * To understand why this is done, please read the comment to the
715 * get_reaper_busy() function.
717 idletime
= reaperage
;
718 if (reaperage
>= busytime
)
719 idletime
= reaperage
- busytime
;
721 total_len
= snprintf(d
->buf
, d
->buflen
, "%.2lf %.2lf\n", reaperage
, idletime
);
722 if (total_len
< 0 || total_len
>= d
->buflen
)
723 return log_error(0, "Failed to write to cache");
725 d
->size
= (int)total_len
;
728 if (total_len
> size
)
731 memcpy(buf
, d
->buf
, total_len
);
735 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
736 static int proc_stat_read(char *buf
, size_t size
, off_t offset
,
737 struct fuse_file_info
*fi
)
739 __do_free
char *cg
= NULL
, *cpuset
= NULL
, *line
= NULL
;
740 __do_free
void *fopen_cache
= NULL
;
741 __do_free
struct cpuacct_usage
*cg_cpu_usage
= NULL
;
742 __do_fclose
FILE *f
= NULL
;
743 struct fuse_context
*fc
= fuse_get_context();
744 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fc
->private_data
;
745 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
746 size_t linelen
= 0, total_len
= 0;
747 int curcpu
= -1; /* cpu numbering starts at 0 */
749 uint64_t user
= 0, nice
= 0, system
= 0, idle
= 0, iowait
= 0, irq
= 0,
750 softirq
= 0, steal
= 0, guest
= 0, guest_nice
= 0;
751 uint64_t user_sum
= 0, nice_sum
= 0, system_sum
= 0, idle_sum
= 0,
752 iowait_sum
= 0, irq_sum
= 0, softirq_sum
= 0, steal_sum
= 0,
753 guest_sum
= 0, guest_nice_sum
= 0;
754 char cpuall
[CPUALL_MAX_SIZE
];
755 /* reserve for cpu all */
756 char *cache
= d
->buf
+ CPUALL_MAX_SIZE
;
757 size_t cache_size
= d
->buflen
- CPUALL_MAX_SIZE
;
758 int cg_cpu_usage_size
= 0;
763 if (offset
> d
->size
)
769 left
= d
->size
- offset
;
770 total_len
= left
> size
? size
: left
;
771 memcpy(buf
, d
->buf
+ offset
, total_len
);
776 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
777 if (initpid
<= 1 || is_shared_pidns(initpid
))
781 * When a container runs in the host pid namespace, initpid == 1 and its
782 * cgroup is "/", so we should return the host OS's /proc contents.
783 * In some cases cpuacct.usage_all in "/" will be larger than /proc/stat.
786 return read_file_fuse("/proc/stat", buf
, size
, d
);
788 cg
= get_pid_cgroup(initpid
, "cpuset");
790 return read_file_fuse("/proc/stat", buf
, size
, d
);
791 prune_init_slice(cg
);
793 cpuset
= get_cpuset(cg
);
797 f
= fopen_cached("/proc/stat", "re", &fopen_cache
);
802 * Read cpuacct.usage_all for all CPUs.
803 * If the cpuacct cgroup is present, it is used to calculate the container's
804 * CPU usage. If not, values from the host's /proc/stat are used.
806 if (read_cpuacct_usage_all(cg
, cpuset
, &cg_cpu_usage
, &cg_cpu_usage_size
) == 0) {
807 if (cgroup_ops
->can_use_cpuview(cgroup_ops
) && opts
&& opts
->use_cfs
) {
808 total_len
= cpuview_proc_stat(cg
, cpuset
, cg_cpu_usage
,
809 cg_cpu_usage_size
, f
,
814 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
818 if (getline(&line
, &linelen
, f
) < 0)
819 return log_error(0, "proc_stat_read read first line failed");
821 while (getline(&line
, &linelen
, f
) != -1) {
823 char cpu_char
[10]; /* That's a lot of cores */
825 uint64_t all_used
, cg_used
, new_idle
;
828 if (strlen(line
) == 0)
830 if (sscanf(line
, "cpu%9[^ ]", cpu_char
) != 1) {
831 /* not a ^cpuN line containing a number N, just print it */
832 l
= snprintf(cache
, cache_size
, "%s", line
);
834 return log_error(0, "Failed to write cache");
836 return log_error(0, "Write to cache was truncated");
845 if (sscanf(cpu_char
, "%d", &physcpu
) != 1)
848 if (!cpu_in_cpuset(physcpu
, cpuset
))
853 ret
= sscanf(line
, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
864 if (ret
!= 10 || !cg_cpu_usage
) {
865 c
= strchr(line
, ' ');
869 l
= snprintf(cache
, cache_size
, "cpu%d%s", curcpu
, c
);
871 return log_error(0, "Failed to write cache");
873 return log_error(0, "Write to cache was truncated");
884 if (physcpu
>= cg_cpu_usage_size
)
887 all_used
= user
+ nice
+ system
+ iowait
+ irq
+ softirq
+ steal
+ guest
+ guest_nice
;
888 cg_used
= cg_cpu_usage
[physcpu
].user
+ cg_cpu_usage
[physcpu
].system
;
890 if (all_used
>= cg_used
) {
891 new_idle
= idle
+ (all_used
- cg_used
);
894 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64
" in /proc/stat, %" PRIu64
" in cpuacct.usage_all; unable to determine idle time",
895 curcpu
, cg
, all_used
, cg_used
);
899 l
= snprintf(cache
, cache_size
,
900 "cpu%d %" PRIu64
" 0 %" PRIu64
" %" PRIu64
" 0 0 0 0 0 0\n",
901 curcpu
, cg_cpu_usage
[physcpu
].user
,
902 cg_cpu_usage
[physcpu
].system
, new_idle
);
904 return log_error(0, "Failed to write cache");
906 return log_error(0, "Write to cache was truncated");
912 user_sum
+= cg_cpu_usage
[physcpu
].user
;
913 system_sum
+= cg_cpu_usage
[physcpu
].system
;
914 idle_sum
+= new_idle
;
918 system_sum
+= system
;
920 iowait_sum
+= iowait
;
922 softirq_sum
+= softirq
;
925 guest_nice_sum
+= guest_nice
;
931 int cpuall_len
= snprintf(cpuall
, CPUALL_MAX_SIZE
, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
942 if (cpuall_len
> 0 && cpuall_len
< CPUALL_MAX_SIZE
) {
943 memcpy(cache
, cpuall
, cpuall_len
);
946 /* shouldn't happen */
947 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len
);
951 memmove(cache
, d
->buf
+ CPUALL_MAX_SIZE
, total_len
);
952 total_len
+= cpuall_len
;
957 if (total_len
> size
)
960 memcpy(buf
, d
->buf
, total_len
);
964 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
965 static bool cgroup_parse_memory_stat(const char *cgroup
, struct memory_stat
*mstat
)
967 __do_close
int fd
= -EBADF
;
968 __do_fclose
FILE *f
= NULL
;
969 __do_free
char *line
= NULL
;
970 __do_free
void *fdopen_cache
= NULL
;
975 fd
= cgroup_ops
->get_memory_stats_fd(cgroup_ops
, cgroup
);
979 f
= fdopen_cached(fd
, "re", &fdopen_cache
);
983 unified
= pure_unified_layout(cgroup_ops
);
984 while ((linelen
= getline(&line
, &len
, f
)) != -1) {
985 if (!unified
&& startswith(line
, "hierarchical_memory_limit")) {
986 sscanf(line
, "hierarchical_memory_limit %" PRIu64
, &(mstat
->hierarchical_memory_limit
));
987 } else if (!unified
&& startswith(line
, "hierarchical_memsw_limit")) {
988 sscanf(line
, "hierarchical_memsw_limit %" PRIu64
, &(mstat
->hierarchical_memsw_limit
));
989 } else if (startswith(line
, unified
? "file" :"total_cache")) {
990 sscanf(line
, unified
? "file %" PRIu64
: "total_cache %" PRIu64
, &(mstat
->total_cache
));
991 } else if (!unified
&& startswith(line
, "total_rss")) {
992 sscanf(line
, "total_rss %" PRIu64
, &(mstat
->total_rss
));
993 } else if (!unified
&& startswith(line
, "total_rss_huge")) {
994 sscanf(line
, "total_rss_huge %" PRIu64
, &(mstat
->total_rss_huge
));
995 } else if (startswith(line
, unified
? "shmem" : "total_shmem")) {
996 sscanf(line
, unified
? "shmem %" PRIu64
: "total_shmem %" PRIu64
, &(mstat
->total_shmem
));
997 } else if (startswith(line
, unified
? "file_mapped" : "total_mapped_file")) {
998 sscanf(line
, unified
? "file_mapped %" PRIu64
: "total_mapped_file %" PRIu64
, &(mstat
->total_mapped_file
));
999 } else if (!unified
&& startswith(line
, "total_dirty")) {
1000 sscanf(line
, "total_dirty %" PRIu64
, &(mstat
->total_dirty
));
1001 } else if (!unified
&& startswith(line
, "total_writeback")) {
1002 sscanf(line
, "total_writeback %" PRIu64
, &(mstat
->total_writeback
));
1003 } else if (!unified
&& startswith(line
, "total_swap")) {
1004 sscanf(line
, "total_swap %" PRIu64
, &(mstat
->total_swap
));
1005 } else if (!unified
&& startswith(line
, "total_pgpgin")) {
1006 sscanf(line
, "total_pgpgin %" PRIu64
, &(mstat
->total_pgpgin
));
1007 } else if (!unified
&& startswith(line
, "total_pgpgout")) {
1008 sscanf(line
, "total_pgpgout %" PRIu64
, &(mstat
->total_pgpgout
));
1009 } else if (startswith(line
, unified
? "pgfault" : "total_pgfault")) {
1010 sscanf(line
, unified
? "pgfault %" PRIu64
: "total_pgfault %" PRIu64
, &(mstat
->total_pgfault
));
1011 } else if (startswith(line
, unified
? "pgmajfault" : "total_pgmajfault")) {
1012 sscanf(line
, unified
? "pgmajfault %" PRIu64
: "total_pgmajfault %" PRIu64
, &(mstat
->total_pgmajfault
));
1013 } else if (startswith(line
, unified
? "inactive_anon" : "total_inactive_anon")) {
1014 sscanf(line
, unified
? "inactive_anon %" PRIu64
: "total_inactive_anon %" PRIu64
, &(mstat
->total_inactive_anon
));
1015 } else if (startswith(line
, unified
? "active_anon" : "total_active_anon")) {
1016 sscanf(line
, unified
? "active_anon %" PRIu64
: "total_active_anon %" PRIu64
, &(mstat
->total_active_anon
));
1017 } else if (startswith(line
, unified
? "inactive_file" : "total_inactive_file")) {
1018 sscanf(line
, unified
? "inactive_file %" PRIu64
: "total_inactive_file %" PRIu64
, &(mstat
->total_inactive_file
));
1019 } else if (startswith(line
, unified
? "active_file" : "total_active_file")) {
1020 sscanf(line
, unified
? "active_file %" PRIu64
: "total_active_file %" PRIu64
, &(mstat
->total_active_file
));
1021 } else if (startswith(line
, unified
? "unevictable" : "total_unevictable")) {
1022 sscanf(line
, unified
? "unevictable %" PRIu64
: "total_unevictable %" PRIu64
, &(mstat
->total_unevictable
));
1029 static int proc_meminfo_read(char *buf
, size_t size
, off_t offset
,
1030 struct fuse_file_info
*fi
)
1032 __do_free
char *cgroup
= NULL
, *line
= NULL
, *memusage_str
= NULL
,
1033 *memswlimit_str
= NULL
, *memswusage_str
= NULL
;
1034 __do_free
void *fopen_cache
= NULL
;
1035 __do_fclose
FILE *f
= NULL
;
1036 struct fuse_context
*fc
= fuse_get_context();
1037 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fuse_get_context()->private_data
;
1038 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
1039 uint64_t memlimit
= 0, memusage
= 0, memswlimit
= 0, memswusage
= 0,
1041 struct memory_stat mstat
= {};
1042 size_t linelen
= 0, total_len
= 0;
1043 char *cache
= d
->buf
;
1044 size_t cache_size
= d
->buflen
;
1050 if (offset
> d
->size
)
1056 left
= d
->size
- offset
;
1057 total_len
= left
> size
? size
: left
;
1058 memcpy(buf
, cache
+ offset
, total_len
);
1063 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
1064 if (initpid
<= 1 || is_shared_pidns(initpid
))
1067 cgroup
= get_pid_cgroup(initpid
, "memory");
1069 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1071 prune_init_slice(cgroup
);
1073 memlimit
= get_min_memlimit(cgroup
, false);
1075 ret
= cgroup_ops
->get_memory_current(cgroup_ops
, cgroup
, &memusage_str
);
1077 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1079 if (!cgroup_parse_memory_stat(cgroup
, &mstat
))
1080 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1083 * Following values are allowed to fail, because swapaccount might be
1084 * turned off for current kernel.
1086 ret
= cgroup_ops
->get_memory_swap_max(cgroup_ops
, cgroup
, &memswlimit_str
);
1088 ret
= cgroup_ops
->get_memory_swap_current(cgroup_ops
, cgroup
, &memswusage_str
);
1090 memswlimit
= get_min_memlimit(cgroup
, true);
1091 memswlimit
= memswlimit
/ 1024;
1092 if (safe_uint64(memswusage_str
, &memswusage
, 10) < 0)
1093 lxcfs_error("Failed to convert memswusage %s", memswusage_str
);
1094 memswusage
= memswusage
/ 1024;
1097 if (safe_uint64(memusage_str
, &memusage
, 10) < 0)
1098 lxcfs_error("Failed to convert memusage %s", memswusage_str
);
1102 f
= fopen_cached("/proc/meminfo", "re", &fopen_cache
);
1104 return read_file_fuse("/proc/meminfo", buf
, size
, d
);
1106 while (getline(&line
, &linelen
, f
) != -1) {
1108 char *printme
, lbuf
[100];
1110 memset(lbuf
, 0, 100);
1111 if (startswith(line
, "MemTotal:")) {
1112 sscanf(line
+sizeof("MemTotal:")-1, "%" PRIu64
, &hosttotal
);
1113 if (hosttotal
< memlimit
)
1114 memlimit
= hosttotal
;
1115 snprintf(lbuf
, 100, "MemTotal: %8" PRIu64
" kB\n", memlimit
);
1117 } else if (startswith(line
, "MemFree:")) {
1118 snprintf(lbuf
, 100, "MemFree: %8" PRIu64
" kB\n", memlimit
- memusage
);
1120 } else if (startswith(line
, "MemAvailable:")) {
1121 snprintf(lbuf
, 100, "MemAvailable: %8" PRIu64
" kB\n", memlimit
- memusage
+ mstat
.total_cache
/ 1024);
1123 } else if (startswith(line
, "SwapTotal:") && memswlimit
> 0 && opts
&& opts
->swap_off
== false) {
1124 snprintf(lbuf
, 100, "SwapTotal: %8" PRIu64
" kB\n",
1125 (memswlimit
>= memlimit
)
1126 ? (memswlimit
- memlimit
)
1129 } else if (startswith(line
, "SwapTotal:") && opts
&& opts
->swap_off
== true) {
1130 snprintf(lbuf
, 100, "SwapTotal: %8" PRIu64
" kB\n", (uint64_t)0);
1132 } else if (startswith(line
, "SwapFree:") && memswlimit
> 0 &&
1133 memswusage
> 0 && opts
&& opts
->swap_off
== false) {
1134 uint64_t swaptotal
= memswlimit
,
1135 swapusage
= memusage
> memswusage
1137 : memswusage
- memusage
,
1138 swapfree
= swapusage
< swaptotal
1139 ? swaptotal
- swapusage
1141 snprintf(lbuf
, 100, "SwapFree: %8" PRIu64
" kB\n", swapfree
);
1143 } else if (startswith(line
, "SwapFree:") && opts
&& opts
->swap_off
== true) {
1144 snprintf(lbuf
, 100, "SwapFree: %8" PRIu64
" kB\n", (uint64_t)0);
1146 } else if (startswith(line
, "Slab:")) {
1147 snprintf(lbuf
, 100, "Slab: %8" PRIu64
" kB\n", (uint64_t)0);
1149 } else if (startswith(line
, "Buffers:")) {
1150 snprintf(lbuf
, 100, "Buffers: %8" PRIu64
" kB\n", (uint64_t)0);
1152 } else if (startswith(line
, "Cached:")) {
1153 snprintf(lbuf
, 100, "Cached: %8" PRIu64
" kB\n",
1154 mstat
.total_cache
/ 1024);
1156 } else if (startswith(line
, "SwapCached:")) {
1157 snprintf(lbuf
, 100, "SwapCached: %8" PRIu64
" kB\n", (uint64_t)0);
1159 } else if (startswith(line
, "Active:")) {
1160 snprintf(lbuf
, 100, "Active: %8" PRIu64
" kB\n",
1161 (mstat
.total_active_anon
+
1162 mstat
.total_active_file
) /
1165 } else if (startswith(line
, "Inactive:")) {
1166 snprintf(lbuf
, 100, "Inactive: %8" PRIu64
" kB\n",
1167 (mstat
.total_inactive_anon
+
1168 mstat
.total_inactive_file
) /
1171 } else if (startswith(line
, "Active(anon)")) {
1172 snprintf(lbuf
, 100, "Active(anon): %8" PRIu64
" kB\n",
1173 mstat
.total_active_anon
/ 1024);
1175 } else if (startswith(line
, "Inactive(anon)")) {
1176 snprintf(lbuf
, 100, "Inactive(anon): %8" PRIu64
" kB\n",
1177 mstat
.total_inactive_anon
/ 1024);
1179 } else if (startswith(line
, "Active(file)")) {
1180 snprintf(lbuf
, 100, "Active(file): %8" PRIu64
" kB\n",
1181 mstat
.total_active_file
/ 1024);
1183 } else if (startswith(line
, "Inactive(file)")) {
1184 snprintf(lbuf
, 100, "Inactive(file): %8" PRIu64
" kB\n",
1185 mstat
.total_inactive_file
/ 1024);
1187 } else if (startswith(line
, "Unevictable")) {
1188 snprintf(lbuf
, 100, "Unevictable: %8" PRIu64
" kB\n",
1189 mstat
.total_unevictable
/ 1024);
1191 } else if (startswith(line
, "Dirty")) {
1192 snprintf(lbuf
, 100, "Dirty: %8" PRIu64
" kB\n",
1193 mstat
.total_dirty
/ 1024);
1195 } else if (startswith(line
, "Writeback")) {
1196 snprintf(lbuf
, 100, "Writeback: %8" PRIu64
" kB\n",
1197 mstat
.total_writeback
/ 1024);
1199 } else if (startswith(line
, "AnonPages")) {
1200 snprintf(lbuf
, 100, "AnonPages: %8" PRIu64
" kB\n",
1201 (mstat
.total_active_anon
+
1202 mstat
.total_inactive_anon
- mstat
.total_shmem
) /
1205 } else if (startswith(line
, "Mapped")) {
1206 snprintf(lbuf
, 100, "Mapped: %8" PRIu64
" kB\n",
1207 mstat
.total_mapped_file
/ 1024);
1209 } else if (startswith(line
, "SReclaimable")) {
1210 snprintf(lbuf
, 100, "SReclaimable: %8" PRIu64
" kB\n", (uint64_t)0);
1212 } else if (startswith(line
, "SUnreclaim")) {
1213 snprintf(lbuf
, 100, "SUnreclaim: %8" PRIu64
" kB\n", (uint64_t)0);
1215 } else if (startswith(line
, "Shmem:")) {
1216 snprintf(lbuf
, 100, "Shmem: %8" PRIu64
" kB\n",
1217 mstat
.total_shmem
/ 1024);
1219 } else if (startswith(line
, "ShmemHugePages")) {
1220 snprintf(lbuf
, 100, "ShmemHugePages: %8" PRIu64
" kB\n", (uint64_t)0);
1222 } else if (startswith(line
, "ShmemPmdMapped")) {
1223 snprintf(lbuf
, 100, "ShmemPmdMapped: %8" PRIu64
" kB\n", (uint64_t)0);
1225 } else if (startswith(line
, "AnonHugePages")) {
1226 snprintf(lbuf
, 100, "AnonHugePages: %8" PRIu64
" kB\n",
1227 mstat
.total_rss_huge
/ 1024);
1233 l
= snprintf(cache
, cache_size
, "%s", printme
);
1235 return log_error(0, "Failed to write cache");
1236 if (l
>= cache_size
)
1237 return log_error(0, "Write to cache was truncated");
1245 d
->size
= total_len
;
1246 if (total_len
> size
)
1248 memcpy(buf
, d
->buf
, total_len
);
1253 __lxcfs_fuse_ops
int proc_read(const char *path
, char *buf
, size_t size
,
1254 off_t offset
, struct fuse_file_info
*fi
)
1256 struct file_info
*f
= INTTYPE_TO_PTR(fi
->fh
);
1259 case LXC_TYPE_PROC_MEMINFO
:
1260 if (liblxcfs_functional())
1261 return proc_meminfo_read(buf
, size
, offset
, fi
);
1263 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH
,
1264 buf
, size
, offset
, f
);
1265 case LXC_TYPE_PROC_CPUINFO
:
1266 if (liblxcfs_functional())
1267 return proc_cpuinfo_read(buf
, size
, offset
, fi
);
1269 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH
,
1270 buf
, size
, offset
, f
);
1271 case LXC_TYPE_PROC_UPTIME
:
1272 if (liblxcfs_functional())
1273 return proc_uptime_read(buf
, size
, offset
, fi
);
1275 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH
,
1276 buf
, size
, offset
, f
);
1277 case LXC_TYPE_PROC_STAT
:
1278 if (liblxcfs_functional())
1279 return proc_stat_read(buf
, size
, offset
, fi
);
1281 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH
, buf
,
1283 case LXC_TYPE_PROC_DISKSTATS
:
1284 if (liblxcfs_functional())
1285 return proc_diskstats_read(buf
, size
, offset
, fi
);
1287 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH
,
1288 buf
, size
, offset
, f
);
1289 case LXC_TYPE_PROC_SWAPS
:
1290 if (liblxcfs_functional())
1291 return proc_swaps_read(buf
, size
, offset
, fi
);
1293 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH
, buf
,
1295 case LXC_TYPE_PROC_LOADAVG
:
1296 if (liblxcfs_functional())
1297 return proc_loadavg_read(buf
, size
, offset
, fi
);
1299 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH
,
1300 buf
, size
, offset
, f
);