]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/proc_fuse.c
Merge pull request #501 from phanhuy1502/detect-disable-cfs-quota
[mirror_lxcfs.git] / src / proc_fuse.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
1f5596dd 2
f834b6bf
SP
3#include "config.h"
4
1f5596dd
CB
5#include <dirent.h>
6#include <errno.h>
7#include <fcntl.h>
1f5596dd
CB
8#include <inttypes.h>
9#include <libgen.h>
10#include <pthread.h>
11#include <sched.h>
12#include <stdarg.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <time.h>
19#include <unistd.h>
20#include <wait.h>
21#include <linux/magic.h>
22#include <linux/sched.h>
23#include <sys/epoll.h>
24#include <sys/mman.h>
25#include <sys/mount.h>
26#include <sys/param.h>
27#include <sys/socket.h>
28#include <sys/syscall.h>
29#include <sys/sysinfo.h>
30#include <sys/vfs.h>
31
e01afbb7
CB
32#include "proc_fuse.h"
33
1f5596dd 34#include "bindings.h"
1f5596dd
CB
35#include "cgroup_fuse.h"
36#include "cgroups/cgroup.h"
37#include "cgroups/cgroup_utils.h"
4ec5c9da 38#include "cpuset_parse.h"
ec2043ed 39#include "lxcfs_fuse_compat.h"
1f5596dd
CB
40#include "memory_utils.h"
41#include "proc_loadavg.h"
4ec5c9da 42#include "proc_cpuview.h"
1f5596dd
CB
43#include "utils.h"
44
acff9786
CB
/*
 * Per-cgroup memory statistics.
 *
 * NOTE(review): the field names look like the keys of a memory cgroup's
 * memory.stat file; confirm against the code that fills this struct.
 */
struct memory_stat {
	/* Hierarchical limits. */
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;

	/* Memory usage broken down by kind. */
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;

	/* Paging event counters. */
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;

	/* LRU list sizes. */
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
66
ce554964
SP
67static off_t get_procfile_size(const char *path)
68{
69 __do_fclose FILE *f = NULL;
70 __do_free char *line = NULL;
71 size_t len = 0;
72 ssize_t sz, answer = 0;
73
74 f = fopen(path, "re");
75 if (!f)
76 return 0;
77
78 while ((sz = getline(&line, &len, f)) != -1)
79 answer += sz;
80
81 return answer;
82}
83
2d7bcab7 84__lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
1f5596dd
CB
85{
86 struct timespec now;
87
88 memset(sb, 0, sizeof(struct stat));
89 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
90 return -EINVAL;
f75d5b75 91
1f5596dd
CB
92 sb->st_uid = sb->st_gid = 0;
93 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
94 if (strcmp(path, "/proc") == 0) {
95 sb->st_mode = S_IFDIR | 00555;
96 sb->st_nlink = 2;
97 return 0;
98 }
f75d5b75
CB
99
100 if (strcmp(path, "/proc/meminfo") == 0 ||
101 strcmp(path, "/proc/cpuinfo") == 0 ||
102 strcmp(path, "/proc/uptime") == 0 ||
103 strcmp(path, "/proc/stat") == 0 ||
104 strcmp(path, "/proc/diskstats") == 0 ||
105 strcmp(path, "/proc/swaps") == 0 ||
6cc153e6
FS
106 strcmp(path, "/proc/loadavg") == 0 ||
107 strcmp(path, "/proc/slabinfo") == 0) {
ce554964 108 sb->st_size = get_procfile_size(path);
1f5596dd
CB
109 sb->st_mode = S_IFREG | 00444;
110 sb->st_nlink = 1;
111 return 0;
112 }
113
114 return -ENOENT;
115}
116
2d7bcab7
CB
117__lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
118 fuse_fill_dir_t filler, off_t offset,
119 struct fuse_file_info *fi)
1f5596dd 120{
f834b6bf
SP
121 if (DIR_FILLER(filler, buf, ".", NULL, 0) != 0 ||
122 DIR_FILLER(filler, buf, "..", NULL, 0) != 0 ||
123 DIR_FILLER(filler, buf, "cpuinfo", NULL, 0) != 0 ||
124 DIR_FILLER(filler, buf, "meminfo", NULL, 0) != 0 ||
125 DIR_FILLER(filler, buf, "stat", NULL, 0) != 0 ||
126 DIR_FILLER(filler, buf, "uptime", NULL, 0) != 0 ||
127 DIR_FILLER(filler, buf, "diskstats", NULL, 0) != 0 ||
128 DIR_FILLER(filler, buf, "swaps", NULL, 0) != 0 ||
6cc153e6
FS
129 DIR_FILLER(filler, buf, "loadavg", NULL, 0) != 0 ||
130 DIR_FILLER(filler, buf, "slabinfo", NULL, 0) != 0)
1f5596dd
CB
131 return -EINVAL;
132
133 return 0;
134}
135
2d7bcab7 136__lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
1f5596dd 137{
700dd417 138 __do_free struct file_info *info = NULL;
1f5596dd 139 int type = -1;
1f5596dd
CB
140
141 if (strcmp(path, "/proc/meminfo") == 0)
142 type = LXC_TYPE_PROC_MEMINFO;
143 else if (strcmp(path, "/proc/cpuinfo") == 0)
144 type = LXC_TYPE_PROC_CPUINFO;
145 else if (strcmp(path, "/proc/uptime") == 0)
146 type = LXC_TYPE_PROC_UPTIME;
147 else if (strcmp(path, "/proc/stat") == 0)
148 type = LXC_TYPE_PROC_STAT;
149 else if (strcmp(path, "/proc/diskstats") == 0)
150 type = LXC_TYPE_PROC_DISKSTATS;
151 else if (strcmp(path, "/proc/swaps") == 0)
152 type = LXC_TYPE_PROC_SWAPS;
153 else if (strcmp(path, "/proc/loadavg") == 0)
154 type = LXC_TYPE_PROC_LOADAVG;
6cc153e6
FS
155 else if (strcmp(path, "/proc/slabinfo") == 0)
156 type = LXC_TYPE_PROC_SLABINFO;
1f5596dd
CB
157 if (type == -1)
158 return -ENOENT;
159
f1a33645 160 info = zalloc(sizeof(*info));
1f5596dd
CB
161 if (!info)
162 return -ENOMEM;
163
1f5596dd
CB
164 info->type = type;
165
166 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
167
f1a33645 168 info->buf = zalloc(info->buflen);
1f5596dd
CB
169 if (!info->buf)
170 return -ENOMEM;
1f5596dd
CB
171 /* set actual size to buffer size */
172 info->size = info->buflen;
173
700dd417 174 fi->fh = PTR_TO_UINT64(move_ptr(info));
1f5596dd
CB
175 return 0;
176}
177
2d7bcab7 178__lxcfs_fuse_ops int proc_access(const char *path, int mask)
1f5596dd
CB
179{
180 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
181 return 0;
182
183 /* these are all read-only */
184 if ((mask & ~R_OK) != 0)
185 return -EACCES;
f75d5b75 186
1f5596dd
CB
187 return 0;
188}
189
2d7bcab7 190__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
1f5596dd
CB
191{
192 do_release_file_info(fi);
193 return 0;
194}
195
e9712933 196static uint64_t get_memlimit(const char *cgroup, bool swap)
1f5596dd 197{
1f5596dd 198 __do_free char *memlimit_str = NULL;
c83158f2 199 uint64_t memlimit = 0;
f75d5b75 200 int ret;
1f5596dd
CB
201
202 if (swap)
203 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
204 else
205 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
ee1a885f 206 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
c83158f2 207 lxcfs_error("Failed to convert memlimit %s", memlimit_str);
1f5596dd
CB
208
209 return memlimit;
210}
211
6f88ab0c
JS
/*
 * GNU flavour of dirname(), modelled on glibc-2.32. POSIX dirname("/some-dir")
 * will return "/some-dir" as opposed to "/", which breaks
 * `get_min_memlimit()`.
 *
 * The directory part is produced by truncating @path in place. When @path is
 * NULL or has no directory part, a pointer to a static "." is returned and
 * @path is left untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *slash, *run;

	if (path == NULL)
		return (char *)dot;

	slash = strrchr(path, '/');

	/*
	 * The last '/' ends the string (and is not the first character): step
	 * over the run of trailing slashes and search for a slash to its left.
	 */
	if (slash != NULL && slash != path && slash[1] == '\0') {
		run = slash;
		while (run != path && run[-1] == '/')
			run--;

		if (run != path) {
			char *scan;

			/* Backwards scan of [path, run) for the last '/'. */
			slash = NULL;
			for (scan = run; scan != path; scan--) {
				if (scan[-1] == '/') {
					slash = scan - 1;
					break;
				}
			}
		}
	}

	/*
	 * The XPG specs require "." to be returned whenever no directory
	 * part is found, hence the static constant string.
	 */
	if (slash == NULL)
		return (char *)dot;

	/* Collapse a run of slashes that ends at the separator. */
	run = slash;
	while (run != path && run[-1] == '/')
		run--;

	if (run != path) {
		slash = run;
	} else if (slash == path + 1) {
		/*
		 * Exactly two slashes at the start of the string are kept as
		 * "//". See XBD 4.10 Path Name Resolution.
		 */
		slash++;
	} else {
		/* The separator is the leading slash: the result is "/". */
		slash = path + 1;
	}

	*slash = '\0';

	return path;
}
273
e9712933 274static uint64_t get_min_memlimit(const char *cgroup, bool swap)
1f5596dd
CB
275{
276 __do_free char *copy = NULL;
c83158f2 277 uint64_t memlimit = 0, retlimit = 0;
1f5596dd
CB
278
279 copy = strdup(cgroup);
f75d5b75
CB
280 if (!copy)
281 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
282
1f5596dd
CB
283 retlimit = get_memlimit(copy, swap);
284
6f88ab0c
JS
285 /*
286 * If the cgroup doesn't start with / (probably won't happen), dirname()
287 * will terminate with "" instead of "/"
288 */
289 while (*copy && strcmp(copy, "/") != 0) {
1f5596dd
CB
290 char *it = copy;
291
6f88ab0c 292 it = gnu_dirname(it);
1f5596dd 293 memlimit = get_memlimit(it, swap);
c83158f2 294 if (memlimit > 0 && memlimit < retlimit)
1f5596dd
CB
295 retlimit = memlimit;
296 };
297
298 return retlimit;
299}
300
/* Return true if @line begins with @pref (an empty @pref always matches). */
static inline bool startswith(const char *line, const char *pref)
{
	while (*pref != '\0') {
		if (*line++ != *pref++)
			return false;
	}

	return true;
}
305
1f5596dd
CB
306static int proc_swaps_read(char *buf, size_t size, off_t offset,
307 struct fuse_file_info *fi)
308{
362d1193
SG
309 __do_free char *cgroup = NULL, *memusage_str = NULL,
310 *memswusage_str = NULL, *memswpriority_str = NULL;
1f5596dd 311 struct fuse_context *fc = fuse_get_context();
84e184b1 312 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
99b183fb 313 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
e9712933 314 uint64_t memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0,
79af0cd1 315 swtotal = 0, swusage = 0, memswpriority = 1,
3ce66074 316 hostswtotal = 0, hostswfree = 0;
1f5596dd
CB
317 ssize_t total_len = 0;
318 ssize_t l = 0;
319 char *cache = d->buf;
320 int ret;
3ce66074
SG
321 __do_free char *line = NULL;
322 __do_free void *fopen_cache = NULL;
323 __do_fclose FILE *f = NULL;
324 size_t linelen = 0;
1f5596dd
CB
325
326 if (offset) {
3cf1e562 327 size_t left;
1f5596dd
CB
328
329 if (offset > d->size)
330 return -EINVAL;
331
332 if (!d->cached)
333 return 0;
334
335 left = d->size - offset;
336 total_len = left > size ? size: left;
337 memcpy(buf, cache + offset, total_len);
338
339 return total_len;
340 }
341
342 pid_t initpid = lookup_initpid_in_store(fc->pid);
343 if (initpid <= 1 || is_shared_pidns(initpid))
344 initpid = fc->pid;
a9f0d623 345
b7b018d0
CB
346 cgroup = get_pid_cgroup(initpid, "memory");
347 if (!cgroup)
1f5596dd 348 return read_file_fuse("/proc/swaps", buf, size, d);
b7b018d0 349 prune_init_slice(cgroup);
1f5596dd 350
b7b018d0 351 memlimit = get_min_memlimit(cgroup, false);
1f5596dd 352
b7b018d0 353 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1f5596dd
CB
354 if (ret < 0)
355 return 0;
356
c83158f2
CB
357 if (safe_uint64(memusage_str, &memusage, 10) < 0)
358 lxcfs_error("Failed to convert memusage %s", memusage_str);
1f5596dd 359
2f2080c1 360 if (wants_swap) {
b7b018d0
CB
361 memswlimit = get_min_memlimit(cgroup, true);
362 if (memswlimit > 0) {
363 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
364 if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) {
365 if (memlimit > memswlimit)
366 swtotal = 0;
367 else
368 swtotal = (memswlimit - memlimit) / 1024;
369 if (memusage > memswusage || swtotal == 0)
370 swusage = 0;
371 else
372 swusage = (memswusage - memusage) / 1024;
b7b018d0 373 }
362d1193
SG
374
375 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
376 if (ret >= 0)
377 safe_uint64(memswpriority_str, &memswpriority, 10);
2f2080c1 378 }
1f5596dd
CB
379 }
380
381 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
382
3ce66074
SG
383 /* Read host total and free values */
384 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
385 if (!f)
386 return 0;
1f5596dd 387
3ce66074
SG
388 while (getline(&line, &linelen, f) != -1) {
389 if (startswith(line, "SwapTotal:"))
390 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
391 else if (startswith(line, "SwapFree:"))
392 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
393 }
1f5596dd 394
3ce66074
SG
395 if (wants_swap) {
396 /* The total amount of swap is always reported to be the
397 lesser of the RAM+SWAP limit or the SWAP device size.
398 This is because the kernel can swap as much as it
399 wants and not only up to swtotal. */
400 swtotal = memlimit / 1024 + swtotal;
401 if (hostswtotal < swtotal) {
402 swtotal = hostswtotal;
1f5596dd 403 }
1f5596dd 404
3ce66074
SG
405 /* When swappiness is 0, pretend we can't swap. */
406 if (memswpriority == 0) {
407 swtotal = swusage;
408 }
362d1193
SG
409 }
410
b7b018d0 411 if (swtotal > 0) {
1f5596dd 412 l = snprintf(d->buf + total_len, d->size - total_len,
e9712933 413 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
7cbfbc74 414 36, " ", swtotal, swusage);
1f5596dd
CB
415 total_len += l;
416 }
417
f75d5b75
CB
418 if (total_len < 0 || l < 0)
419 return log_error(0, "Failed writing to cache");
1f5596dd
CB
420
421 d->cached = 1;
422 d->size = (int)total_len;
423
3cf1e562 424 if ((size_t)total_len > size)
f75d5b75 425 total_len = size;
1f5596dd 426 memcpy(buf, d->buf, total_len);
f75d5b75 427
1f5596dd
CB
428 return total_len;
429}
430
/*
 * Look up one counter in the text of a blkio statistics file.
 *
 * @str holds newline-separated records of the form
 * "<major>:<minor> <iotype> <value>". The first line that begins with
 * "<major>:<minor> <iotype>" is parsed and its value stored in @v.
 *
 * @v is set to 0 when @str is NULL, when no line matches, or when the
 * matching line carries no parsable number.
 */
static void get_blkio_io_value(const char *str, unsigned major, unsigned minor,
			       const char *iotype, uint64_t *v)
{
	char key[32];
	size_t len;
	int ret;

	*v = 0;

	/* The caller may have failed to read the statistics file. */
	if (!str)
		return;

	ret = snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	if (ret < 0 || (size_t)ret >= sizeof(key))
		return;
	len = (size_t)ret;

	while (*str) {
		const char *eol;

		if (strncmp(str, key, len) == 0) {
			/* SCNu64 matches uint64_t on every ABI; "%lu" does not. */
			sscanf(str + len, "%" SCNu64, v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;

		str = eol + 1;
	}
}
454
998cdfc9
CB
/*
 * One device entry of /proc/diskstats. The number in each comment is the
 * field's column in that file.
 */
struct lxcfs_diskstats {
	/* Device identification. */
	unsigned int major;		/*  1 - major number */
	unsigned int minor;		/*  2 - minor number */
	char dev_name[72];		/*  3 - device name */

	/* Reads. */
	uint64_t read;			/*  4 - reads completed successfully */
	uint64_t read_merged;		/*  5 - reads merged */
	uint64_t read_sectors;		/*  6 - sectors read */
	uint64_t read_ticks;		/*  7 - time spent reading (ms) */

	/* Writes. */
	uint64_t write;			/*  8 - writes completed */
	uint64_t write_merged;		/*  9 - writes merged */
	uint64_t write_sectors;		/* 10 - sectors written */
	uint64_t write_ticks;		/* 11 - time spent writing (ms) */

	/* Queue state. */
	uint64_t ios_pgr;		/* 12 - I/Os currently in progress */
	uint64_t total_ticks;		/* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;		/* 14 - weighted time spent doing I/Os (ms) */

	/* Discards (4.18+). */
	uint64_t discard;		/* 15 - discards completed successfully */
	uint64_t discard_merged;	/* 16 - discards merged */
	uint64_t discard_sectors;	/* 17 - sectors discarded */
	uint64_t discard_ticks;		/* 18 - time spent discarding */
};
475
1f5596dd
CB
476static int proc_diskstats_read(char *buf, size_t size, off_t offset,
477 struct fuse_file_info *fi)
478{
479 __do_free char *cg = NULL, *io_serviced_str = NULL,
480 *io_merged_str = NULL, *io_service_bytes_str = NULL,
481 *io_wait_time_str = NULL, *io_service_time_str = NULL,
482 *line = NULL;
757a63e7 483 __do_free void *fopen_cache = NULL;
1f5596dd
CB
484 __do_fclose FILE *f = NULL;
485 struct fuse_context *fc = fuse_get_context();
99b183fb 486 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
998cdfc9
CB
487 struct lxcfs_diskstats stats = {};
488 /* helper fields */
489 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
490 write_wait_time, discard_wait_time;
1f5596dd
CB
491 char *cache = d->buf;
492 size_t cache_size = d->buflen;
493 size_t linelen = 0, total_len = 0;
1f5596dd
CB
494 int i = 0;
495 int ret;
1f5596dd 496
cbfc55fd 497 if (offset) {
3cf1e562 498 size_t left;
1f5596dd
CB
499
500 if (offset > d->size)
501 return -EINVAL;
502
503 if (!d->cached)
504 return 0;
505
506 left = d->size - offset;
507 total_len = left > size ? size: left;
508 memcpy(buf, cache + offset, total_len);
509
510 return total_len;
511 }
512
513 pid_t initpid = lookup_initpid_in_store(fc->pid);
514 if (initpid <= 1 || is_shared_pidns(initpid))
515 initpid = fc->pid;
a9f0d623 516
1f5596dd
CB
517 cg = get_pid_cgroup(initpid, "blkio");
518 if (!cg)
519 return read_file_fuse("/proc/diskstats", buf, size, d);
520 prune_init_slice(cg);
521
522 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
523 if (ret < 0) {
524 if (ret == -EOPNOTSUPP)
525 return read_file_fuse("/proc/diskstats", buf, size, d);
526 }
527
528 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
529 if (ret < 0) {
530 if (ret == -EOPNOTSUPP)
531 return read_file_fuse("/proc/diskstats", buf, size, d);
532 }
533
534 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
535 if (ret < 0) {
536 if (ret == -EOPNOTSUPP)
537 return read_file_fuse("/proc/diskstats", buf, size, d);
538 }
539
540 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
541 if (ret < 0) {
542 if (ret == -EOPNOTSUPP)
543 return read_file_fuse("/proc/diskstats", buf, size, d);
544 }
545
546 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
547 if (ret < 0) {
548 if (ret == -EOPNOTSUPP)
549 return read_file_fuse("/proc/diskstats", buf, size, d);
550 }
551
757a63e7 552 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
1f5596dd
CB
553 if (!f)
554 return 0;
555
556 while (getline(&line, &linelen, f) != -1) {
557 ssize_t l;
558 char lbuf[256];
559
998cdfc9 560 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
1f5596dd
CB
561 if (i != 3)
562 continue;
563
998cdfc9
CB
564 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
565 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
566 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
567
568 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
569 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
570 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
571
572 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
573 stats.read_sectors = stats.read_sectors / 512;
574 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
575 stats.write_sectors = stats.write_sectors / 512;
576 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
577 stats.discard_sectors = stats.discard_sectors / 512;
578
579 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
580 read_service_time = read_service_time / 1000000;
581 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
582 read_wait_time = read_wait_time / 1000000;
583 stats.read_ticks = read_service_time + read_wait_time;
584
585 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
586 write_service_time = write_service_time / 1000000;
587 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
588 write_wait_time = write_wait_time / 1000000;
589 stats.write_ticks = write_service_time + write_wait_time;
590
591 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
592 discard_service_time = discard_service_time / 1000000;
593 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
594 discard_wait_time = discard_wait_time / 1000000;
595 stats.discard_ticks = discard_service_time + discard_wait_time;
596
597 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
598 stats.total_ticks = stats.total_ticks / 1000000;
1f5596dd
CB
599
600 memset(lbuf, 0, 256);
998cdfc9
CB
601 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
602 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
603 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks ||
604 stats.discard_merged || stats.discard_sectors || stats.discard_ticks)
605 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
606 stats.major,
607 stats.minor,
608 stats.dev_name,
609 stats.read,
610 stats.read_merged,
611 stats.read_sectors,
612 stats.read_ticks,
613 stats.write,
614 stats.write_merged,
615 stats.write_sectors,
616 stats.write_ticks,
617 stats.ios_pgr,
618 stats.total_ticks,
619 stats.rq_ticks,
620 stats.discard_merged,
621 stats.discard_sectors,
622 stats.discard_ticks);
1f5596dd
CB
623 else
624 continue;
625
626 l = snprintf(cache, cache_size, "%s", lbuf);
f75d5b75
CB
627 if (l < 0)
628 return log_error(0, "Failed to write cache");
3cf1e562 629 if ((size_t)l >= cache_size)
f75d5b75
CB
630 return log_error(0, "Write to cache was truncated");
631
1f5596dd
CB
632 cache += l;
633 cache_size -= l;
634 total_len += l;
635 }
636
637 d->cached = 1;
638 d->size = total_len;
cbfc55fd
CB
639 if (total_len > size)
640 total_len = size;
1f5596dd
CB
641 memcpy(buf, d->buf, total_len);
642
643 return total_len;
644}
645
#ifdef RELOADTEST
/*
 * Drop a marker file so a reload test can tell that this code ran.
 *
 * mknod() takes the file type and the permission bits together in its mode
 * argument; the device number is its third argument. The result is
 * deliberately ignored: the marker is best-effort.
 */
static inline void iwashere(void)
{
	mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
652
c9c93749
CB
653/*
654 * This function retrieves the busy time of a group of tasks by looking at
1f5596dd
CB
655 * cpuacct.usage. Unfortunately, this only makes sense when the container has
656 * been given it's own cpuacct cgroup. If not, this function will take the busy
657 * time of all other taks that do not actually belong to the container into
658 * account as well. If someone has a clever solution for this please send a
659 * patch!
660 */
661static double get_reaper_busy(pid_t task)
662{
663 __do_free char *cgroup = NULL, *usage_str = NULL;
e9712933 664 uint64_t usage = 0;
1f5596dd
CB
665 pid_t initpid;
666
667 initpid = lookup_initpid_in_store(task);
668 if (initpid <= 0)
669 return 0;
670
671 cgroup = get_pid_cgroup(initpid, "cpuacct");
672 if (!cgroup)
673 return 0;
674 prune_init_slice(cgroup);
c9c93749 675
e9712933 676 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
1f5596dd
CB
677 return 0;
678
c83158f2
CB
679 if (safe_uint64(usage_str, &usage, 10) < 0)
680 lxcfs_error("Failed to convert usage %s", usage_str);
681
1f5596dd
CB
682 return ((double)usage / 1000000000);
683}
684
685static uint64_t get_reaper_start_time(pid_t pid)
686{
757a63e7 687 __do_free void *fopen_cache = NULL;
12a60884 688 __do_fclose FILE *f = NULL;
1f5596dd 689 int ret;
1f5596dd 690 uint64_t starttime;
c9c93749
CB
691 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
692 STRLITERALLEN("/stat") + 1];
1f5596dd
CB
693 pid_t qpid;
694
695 qpid = lookup_initpid_in_store(pid);
c9c93749
CB
696 if (qpid <= 0)
697 return ret_errno(EINVAL);
1f5596dd 698
c9c93749
CB
699 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
700 if (ret < 0 || (size_t)ret >= sizeof(path))
701 return ret_errno(EINVAL);
1f5596dd 702
757a63e7 703 f = fopen_cached(path, "re", &fopen_cache);
c9c93749
CB
704 if (!f)
705 return ret_errno(EINVAL);
1f5596dd
CB
706
707 /* Note that the *scanf() argument supression requires that length
708 * modifiers such as "l" are omitted. Otherwise some compilers will yell
709 * at us. It's like telling someone you're not married and then asking
710 * if you can bring your wife to the party.
711 */
712 ret = fscanf(f, "%*d " /* (1) pid %d */
713 "%*s " /* (2) comm %s */
714 "%*c " /* (3) state %c */
715 "%*d " /* (4) ppid %d */
716 "%*d " /* (5) pgrp %d */
717 "%*d " /* (6) session %d */
718 "%*d " /* (7) tty_nr %d */
719 "%*d " /* (8) tpgid %d */
720 "%*u " /* (9) flags %u */
721 "%*u " /* (10) minflt %lu */
722 "%*u " /* (11) cminflt %lu */
723 "%*u " /* (12) majflt %lu */
724 "%*u " /* (13) cmajflt %lu */
725 "%*u " /* (14) utime %lu */
726 "%*u " /* (15) stime %lu */
727 "%*d " /* (16) cutime %ld */
728 "%*d " /* (17) cstime %ld */
729 "%*d " /* (18) priority %ld */
730 "%*d " /* (19) nice %ld */
731 "%*d " /* (20) num_threads %ld */
732 "%*d " /* (21) itrealvalue %ld */
733 "%" PRIu64, /* (22) starttime %llu */
734 &starttime);
12a60884 735 if (ret != 1)
c9c93749 736 return ret_errno(EINVAL);
1f5596dd 737
12a60884 738 return ret_set_errno(starttime, 0);
1f5596dd
CB
739}
740
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to seconds using sysconf(_SC_CLK_TCK).
 *
 * Returns 0 if the start time is unavailable or the tick rate cannot be
 * determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	long ticks_per_sec;

	/* The value is unsigned, so 0 is the only failure indication. */
	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	/* Reject 0 as well as errors so we never divide by zero. */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (uint64_t)ticks_per_sec;
}
759
760static double get_reaper_age(pid_t pid)
761{
762 uint64_t uptime_ms;
763 double procstart, procage;
764
c9c93749
CB
765 /*
766 * We need to substract the time the process has started since system
1f5596dd
CB
767 * boot minus the time when the system has started to get the actual
768 * reaper age.
769 */
770 procstart = get_reaper_start_time_in_sec(pid);
771 procage = procstart;
772 if (procstart > 0) {
773 int ret;
774 struct timespec spec;
775
776 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
777 if (ret < 0)
778 return 0;
779
1f5596dd
CB
780 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
781 procage = (uptime_ms - (procstart * 1000)) / 1000;
782 }
783
784 return procage;
785}
786
787/*
788 * We read /proc/uptime and reuse its second field.
789 * For the first field, we use the mtime for the reaper for
790 * the calling pid as returned by getreaperage
791 */
792static int proc_uptime_read(char *buf, size_t size, off_t offset,
793 struct fuse_file_info *fi)
794{
795 struct fuse_context *fc = fuse_get_context();
99b183fb 796 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd 797 char *cache = d->buf;
d7718002 798 ssize_t total_len = 0, ret = 0;
ea725aba 799 double busytime, idletime, reaperage;
1f5596dd 800
b1ef0dde 801#ifdef RELOADTEST
1f5596dd
CB
802 iwashere();
803#endif
804
f75d5b75 805 if (offset) {
3cf1e562 806 size_t left;
f75d5b75 807
1f5596dd
CB
808 if (offset > d->size)
809 return -EINVAL;
f75d5b75 810
d7718002
CB
811 if (!d->cached)
812 return 0;
813
f75d5b75
CB
814 left = d->size - offset;
815 total_len = left > size ? size : left;
1f5596dd 816 memcpy(buf, cache + offset, total_len);
f75d5b75 817
1f5596dd
CB
818 return total_len;
819 }
820
821 reaperage = get_reaper_age(fc->pid);
f75d5b75
CB
822 /*
823 * To understand why this is done, please read the comment to the
1f5596dd
CB
824 * get_reaper_busy() function.
825 */
826 idletime = reaperage;
ea725aba 827 busytime = get_reaper_busy(fc->pid);
1f5596dd
CB
828 if (reaperage >= busytime)
829 idletime = reaperage - busytime;
830
d7718002
CB
831 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
832 if (ret < 0 || ret >= d->buflen)
c9c93749 833 return read_file_fuse("/proc/uptime", buf, size, d);
d7718002 834 total_len = ret;
1f5596dd 835
1f5596dd 836 d->cached = 1;
d7718002 837 d->size = total_len;
3cf1e562 838 if ((size_t)total_len > size)
f75d5b75 839 total_len = size;
1f5596dd 840 memcpy(buf, d->buf, total_len);
c9c93749 841
1f5596dd
CB
842 return total_len;
843}
844
845#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
846static int proc_stat_read(char *buf, size_t size, off_t offset,
847 struct fuse_file_info *fi)
848{
849 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
757a63e7 850 __do_free void *fopen_cache = NULL;
1f5596dd
CB
851 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
852 __do_fclose FILE *f = NULL;
853 struct fuse_context *fc = fuse_get_context();
8044f626 854 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
99b183fb 855 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
856 size_t linelen = 0, total_len = 0;
857 int curcpu = -1; /* cpu numbering starts at 0 */
858 int physcpu = 0;
1ba088ae
CB
859 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
860 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
861 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
862 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
863 guest_sum = 0, guest_nice_sum = 0;
1f5596dd
CB
864 char cpuall[CPUALL_MAX_SIZE];
865 /* reserve for cpu all */
866 char *cache = d->buf + CPUALL_MAX_SIZE;
867 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
868 int cg_cpu_usage_size = 0;
869
7b367480 870 if (offset) {
3cf1e562 871 size_t left;
7b367480 872
1f5596dd
CB
873 if (offset > d->size)
874 return -EINVAL;
7b367480 875
1f5596dd
CB
876 if (!d->cached)
877 return 0;
7b367480
CB
878
879 left = d->size - offset;
880 total_len = left > size ? size : left;
1f5596dd 881 memcpy(buf, d->buf + offset, total_len);
7b367480 882
1f5596dd
CB
883 return total_len;
884 }
885
886 pid_t initpid = lookup_initpid_in_store(fc->pid);
a9f0d623 887 if (initpid <= 1 || is_shared_pidns(initpid))
1f5596dd
CB
888 initpid = fc->pid;
889
890 /*
891 * when container run with host pid namespace initpid == 1, cgroup will "/"
892 * we should return host os's /proc contents.
893 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
894 */
605e157d 895 if (initpid == 1)
6300e6c6 896 return read_file_fuse("/proc/stat", buf, size, d);
1f5596dd
CB
897
898 cg = get_pid_cgroup(initpid, "cpuset");
1f5596dd
CB
899 if (!cg)
900 return read_file_fuse("/proc/stat", buf, size, d);
901 prune_init_slice(cg);
902
903 cpuset = get_cpuset(cg);
904 if (!cpuset)
905 return 0;
906
77711d7a
CB
907 f = fopen_cached("/proc/stat", "re", &fopen_cache);
908 if (!f)
909 return 0;
910
1494771e
CB
911 /* Skip first system cpu line. */
912 if (getline(&line, &linelen, f) < 0)
913 return log_error(0, "proc_stat_read read first line failed");
914
1f5596dd
CB
915 /*
916 * Read cpuacct.usage_all for all CPUs.
917 * If the cpuacct cgroup is present, it is used to calculate the container's
918 * CPU usage. If not, values from the host's /proc/stat are used.
919 */
f9434b9a
CB
920 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
921 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
922 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage,
923 cg_cpu_usage_size, f,
924 d->buf, d->buflen);
925 goto out;
926 }
927 } else {
928 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
929 }
1f5596dd 930
1f5596dd
CB
931 while (getline(&line, &linelen, f) != -1) {
932 ssize_t l;
933 char cpu_char[10]; /* That's a lot of cores */
934 char *c;
935 uint64_t all_used, cg_used, new_idle;
936 int ret;
937
938 if (strlen(line) == 0)
939 continue;
940 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
941 /* not a ^cpuN line containing a number N, just print it */
942 l = snprintf(cache, cache_size, "%s", line);
f75d5b75
CB
943 if (l < 0)
944 return log_error(0, "Failed to write cache");
3cf1e562 945 if ((size_t)l >= cache_size)
f75d5b75
CB
946 return log_error(0, "Write to cache was truncated");
947
1f5596dd
CB
948 cache += l;
949 cache_size -= l;
950 total_len += l;
f75d5b75 951
1f5596dd
CB
952 continue;
953 }
954
955 if (sscanf(cpu_char, "%d", &physcpu) != 1)
956 continue;
f75d5b75 957
1f5596dd
CB
958 if (!cpu_in_cpuset(physcpu, cpuset))
959 continue;
f75d5b75 960
2b8eff1d 961 curcpu++;
1f5596dd
CB
962
963 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
964 &user,
965 &nice,
966 &system,
967 &idle,
968 &iowait,
969 &irq,
970 &softirq,
971 &steal,
972 &guest,
973 &guest_nice);
1f5596dd
CB
974 if (ret != 10 || !cg_cpu_usage) {
975 c = strchr(line, ' ');
976 if (!c)
977 continue;
1f5596dd 978
f75d5b75
CB
979 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
980 if (l < 0)
981 return log_error(0, "Failed to write cache");
3cf1e562 982 if ((size_t)l >= cache_size)
f75d5b75 983 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
984
985 cache += l;
986 cache_size -= l;
987 total_len += l;
988
989 if (ret != 10)
990 continue;
991 }
992
993 if (cg_cpu_usage) {
994 if (physcpu >= cg_cpu_usage_size)
995 break;
996
997 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
998 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
999
1000 if (all_used >= cg_used) {
1001 new_idle = idle + (all_used - cg_used);
1f5596dd 1002 } else {
1e3aa115 1003 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
2b8eff1d 1004 curcpu, cg, all_used, cg_used);
1f5596dd
CB
1005 new_idle = idle;
1006 }
1007
2b8eff1d
CB
1008 l = snprintf(cache, cache_size,
1009 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1010 curcpu, cg_cpu_usage[physcpu].user,
1011 cg_cpu_usage[physcpu].system, new_idle);
f75d5b75
CB
1012 if (l < 0)
1013 return log_error(0, "Failed to write cache");
3cf1e562 1014 if ((size_t)l >= cache_size)
f75d5b75 1015 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
1016
1017 cache += l;
1018 cache_size -= l;
1019 total_len += l;
1020
1021 user_sum += cg_cpu_usage[physcpu].user;
1022 system_sum += cg_cpu_usage[physcpu].system;
1023 idle_sum += new_idle;
1f5596dd
CB
1024 } else {
1025 user_sum += user;
1026 nice_sum += nice;
1027 system_sum += system;
1028 idle_sum += idle;
1029 iowait_sum += iowait;
1030 irq_sum += irq;
1031 softirq_sum += softirq;
1032 steal_sum += steal;
1033 guest_sum += guest;
1034 guest_nice_sum += guest_nice;
1035 }
1036 }
1037
1038 cache = d->buf;
1039
1040 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1041 user_sum,
1042 nice_sum,
1043 system_sum,
1044 idle_sum,
1045 iowait_sum,
1046 irq_sum,
1047 softirq_sum,
1048 steal_sum,
1049 guest_sum,
1050 guest_nice_sum);
1051 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1052 memcpy(cache, cpuall, cpuall_len);
1053 cache += cpuall_len;
1054 } else {
1055 /* shouldn't happen */
f75d5b75 1056 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1f5596dd
CB
1057 cpuall_len = 0;
1058 }
1059
1060 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1061 total_len += cpuall_len;
1062
1063out:
1064 d->cached = 1;
1065 d->size = total_len;
1066 if (total_len > size)
1067 total_len = size;
1068
1069 memcpy(buf, d->buf, total_len);
1070 return total_len;
1071}
1072
1073/* Note that "memory.stat" in cgroup2 is hierarchical by default. */
acff9786 1074static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1f5596dd 1075{
05b7a16d 1076 __do_close int fd = -EBADF;
acff9786
CB
1077 __do_fclose FILE *f = NULL;
1078 __do_free char *line = NULL;
28519477 1079 __do_free void *fdopen_cache = NULL;
acff9786
CB
1080 bool unified;
1081 size_t len = 0;
1082 ssize_t linelen;
1f5596dd 1083
acff9786
CB
1084 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1085 if (fd < 0)
1086 return false;
1087
28519477 1088 f = fdopen_cached(fd, "re", &fdopen_cache);
acff9786
CB
1089 if (!f)
1090 return false;
acff9786
CB
1091
1092 unified = pure_unified_layout(cgroup_ops);
1093 while ((linelen = getline(&line, &len, f)) != -1) {
1094 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1095 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1096 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1097 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
91d63a9e
IM
1098 } else if (startswith(line, unified ? "file" :"total_cache")) {
1099 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
acff9786
CB
1100 } else if (!unified && startswith(line, "total_rss")) {
1101 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1102 } else if (!unified && startswith(line, "total_rss_huge")) {
1103 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1104 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1105 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1106 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1107 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1108 } else if (!unified && startswith(line, "total_dirty")) {
1109 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1110 } else if (!unified && startswith(line, "total_writeback")) {
1111 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1112 } else if (!unified && startswith(line, "total_swap")) {
1113 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1114 } else if (!unified && startswith(line, "total_pgpgin")) {
1115 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1116 } else if (!unified && startswith(line, "total_pgpgout")) {
1117 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1118 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1119 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1120 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1121 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1122 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1123 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1124 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1125 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1126 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1127 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1128 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1129 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1130 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1131 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1f5596dd 1132 }
1f5596dd 1133 }
acff9786
CB
1134
1135 return true;
1f5596dd
CB
1136}
1137
1f5596dd
CB
1138static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1139 struct fuse_file_info *fi)
1140{
63f35cc0 1141 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
362d1193 1142 *memswusage_str = NULL, *memswpriority_str = NULL;
757a63e7 1143 __do_free void *fopen_cache = NULL;
1f5596dd
CB
1144 __do_fclose FILE *f = NULL;
1145 struct fuse_context *fc = fuse_get_context();
84e184b1 1146 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
99b183fb 1147 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
acff9786 1148 uint64_t memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
362d1193
SG
1149 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1150 memswpriority = 1;
334a14f9 1151 struct memory_stat mstat = {};
1f5596dd
CB
1152 size_t linelen = 0, total_len = 0;
1153 char *cache = d->buf;
1154 size_t cache_size = d->buflen;
1155 int ret;
1156
1157 if (offset) {
3cf1e562 1158 size_t left;
1f5596dd
CB
1159
1160 if (offset > d->size)
1161 return -EINVAL;
1162
1163 if (!d->cached)
1164 return 0;
1165
1166 left = d->size - offset;
1167 total_len = left > size ? size : left;
1168 memcpy(buf, cache + offset, total_len);
1169
1170 return total_len;
1171 }
1172
1173 pid_t initpid = lookup_initpid_in_store(fc->pid);
1174 if (initpid <= 1 || is_shared_pidns(initpid))
1175 initpid = fc->pid;
1176
1177 cgroup = get_pid_cgroup(initpid, "memory");
1178 if (!cgroup)
1179 return read_file_fuse("/proc/meminfo", buf, size, d);
1180
1181 prune_init_slice(cgroup);
1182
b7b018d0 1183 /* memory limits */
1f5596dd
CB
1184 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1185 if (ret < 0)
b0f33646 1186 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1187
b7b018d0
CB
1188 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1189 lxcfs_error("Failed to convert memusage %s", memusage_str);
1190
acff9786 1191 if (!cgroup_parse_memory_stat(cgroup, &mstat))
b0f33646 1192 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1193
b7b018d0
CB
1194 memlimit = get_min_memlimit(cgroup, false);
1195
1f5596dd
CB
1196 /*
1197 * Following values are allowed to fail, because swapaccount might be
1198 * turned off for current kernel.
1199 */
2f2080c1 1200 if (wants_swap) {
b7b018d0
CB
1201 memswlimit = get_min_memlimit(cgroup, true);
1202 if (memswlimit > 0) {
2f2080c1 1203 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
b7b018d0
CB
1204 if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) {
1205 if (memlimit > memswlimit)
1206 swtotal = 0;
1207 else
1208 swtotal = (memswlimit - memlimit) / 1024;
1209 if (memusage > memswusage || swtotal == 0)
1210 swusage = 0;
1211 else
1212 swusage = (memswusage - memusage) / 1024;
6bfe1016 1213 }
2f2080c1 1214 }
362d1193
SG
1215
1216 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
1217 if (ret >= 0)
1218 safe_uint64(memswpriority_str, &memswpriority, 10);
1f5596dd
CB
1219 }
1220
757a63e7 1221 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1f5596dd 1222 if (!f)
b0f33646 1223 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1224
b7b018d0
CB
1225 memusage /= 1024;
1226 memlimit /= 1024;
1f5596dd
CB
1227 while (getline(&line, &linelen, f) != -1) {
1228 ssize_t l;
1229 char *printme, lbuf[100];
1230
1231 memset(lbuf, 0, 100);
1232 if (startswith(line, "MemTotal:")) {
acff9786 1233 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
114eb8b8
CB
1234 if (memlimit == 0)
1235 memlimit = hosttotal;
1236
1f5596dd
CB
1237 if (hosttotal < memlimit)
1238 memlimit = hosttotal;
acff9786 1239 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1f5596dd
CB
1240 printme = lbuf;
1241 } else if (startswith(line, "MemFree:")) {
acff9786 1242 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1f5596dd
CB
1243 printme = lbuf;
1244 } else if (startswith(line, "MemAvailable:")) {
acff9786 1245 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024);
1f5596dd 1246 printme = lbuf;
07c90197 1247 } else if (startswith(line, "SwapTotal:")) {
b7b018d0
CB
1248 if (wants_swap) {
1249 uint64_t hostswtotal = 0;
1250
1251 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1252
33aa929e
SG
1253 /* The total amount of swap is always reported to be the
1254 lesser of the RAM+SWAP limit or the SWAP device size.
1255 This is because the kernel can swap as much as it
1256 wants and not only up to swtotal. */
1257
1258 swtotal = memlimit + swtotal;
1259 if (hostswtotal < swtotal) {
b7b018d0 1260 swtotal = hostswtotal;
b7b018d0 1261 }
362d1193
SG
1262
1263 /* When swappiness is 0, pretend we can't swap. */
1264 if (memswpriority == 0) {
1265 swtotal = swusage;
1266 }
b7b018d0
CB
1267 }
1268
6bfe1016 1269 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1f5596dd 1270 printme = lbuf;
07c90197 1271 } else if (startswith(line, "SwapFree:")) {
b7b018d0 1272 if (wants_swap) {
33aa929e 1273 swfree = swtotal - swusage;
07c90197 1274 }
b7b018d0 1275
6bfe1016 1276 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1f5596dd
CB
1277 printme = lbuf;
1278 } else if (startswith(line, "Slab:")) {
6ddc3c00 1279 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1280 printme = lbuf;
1281 } else if (startswith(line, "Buffers:")) {
acff9786 1282 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1283 printme = lbuf;
1284 } else if (startswith(line, "Cached:")) {
acff9786
CB
1285 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1286 mstat.total_cache / 1024);
1f5596dd
CB
1287 printme = lbuf;
1288 } else if (startswith(line, "SwapCached:")) {
acff9786 1289 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1290 printme = lbuf;
1291 } else if (startswith(line, "Active:")) {
acff9786
CB
1292 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1293 (mstat.total_active_anon +
1294 mstat.total_active_file) /
1295 1024);
1f5596dd
CB
1296 printme = lbuf;
1297 } else if (startswith(line, "Inactive:")) {
acff9786
CB
1298 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1299 (mstat.total_inactive_anon +
1300 mstat.total_inactive_file) /
1301 1024);
1f5596dd 1302 printme = lbuf;
659b0278 1303 } else if (startswith(line, "Active(anon):")) {
acff9786
CB
1304 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1305 mstat.total_active_anon / 1024);
1f5596dd 1306 printme = lbuf;
659b0278 1307 } else if (startswith(line, "Inactive(anon):")) {
acff9786
CB
1308 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1309 mstat.total_inactive_anon / 1024);
1f5596dd 1310 printme = lbuf;
659b0278 1311 } else if (startswith(line, "Active(file):")) {
acff9786
CB
1312 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1313 mstat.total_active_file / 1024);
1f5596dd 1314 printme = lbuf;
659b0278 1315 } else if (startswith(line, "Inactive(file):")) {
acff9786
CB
1316 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1317 mstat.total_inactive_file / 1024);
1f5596dd 1318 printme = lbuf;
659b0278 1319 } else if (startswith(line, "Unevictable:")) {
acff9786
CB
1320 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1321 mstat.total_unevictable / 1024);
1322 printme = lbuf;
659b0278 1323 } else if (startswith(line, "Dirty:")) {
acff9786
CB
1324 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1325 mstat.total_dirty / 1024);
1326 printme = lbuf;
659b0278 1327 } else if (startswith(line, "Writeback:")) {
acff9786
CB
1328 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1329 mstat.total_writeback / 1024);
1330 printme = lbuf;
659b0278 1331 } else if (startswith(line, "AnonPages:")) {
acff9786
CB
1332 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1333 (mstat.total_active_anon +
1334 mstat.total_inactive_anon - mstat.total_shmem) /
1335 1024);
1336 printme = lbuf;
659b0278 1337 } else if (startswith(line, "Mapped:")) {
acff9786
CB
1338 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1339 mstat.total_mapped_file / 1024);
1f5596dd 1340 printme = lbuf;
659b0278 1341 } else if (startswith(line, "SReclaimable:")) {
acff9786 1342 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd 1343 printme = lbuf;
659b0278 1344 } else if (startswith(line, "SUnreclaim:")) {
acff9786 1345 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1346 printme = lbuf;
1347 } else if (startswith(line, "Shmem:")) {
acff9786
CB
1348 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1349 mstat.total_shmem / 1024);
1f5596dd 1350 printme = lbuf;
659b0278 1351 } else if (startswith(line, "ShmemHugePages:")) {
acff9786 1352 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd 1353 printme = lbuf;
659b0278 1354 } else if (startswith(line, "ShmemPmdMapped:")) {
acff9786
CB
1355 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1356 printme = lbuf;
659b0278 1357 } else if (startswith(line, "AnonHugePages:")) {
acff9786
CB
1358 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1359 mstat.total_rss_huge / 1024);
1f5596dd 1360 printme = lbuf;
acff9786
CB
1361 } else {
1362 printme = line;
1363 }
1f5596dd
CB
1364
1365 l = snprintf(cache, cache_size, "%s", printme);
f75d5b75
CB
1366 if (l < 0)
1367 return log_error(0, "Failed to write cache");
3cf1e562 1368 if ((size_t)l >= cache_size)
f75d5b75 1369 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
1370
1371 cache += l;
1372 cache_size -= l;
1373 total_len += l;
1374 }
1375
1376 d->cached = 1;
1377 d->size = total_len;
f75d5b75
CB
1378 if (total_len > size)
1379 total_len = size;
1f5596dd
CB
1380 memcpy(buf, d->buf, total_len);
1381
1382 return total_len;
1383}
1384
6cc153e6
FS
1385static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1386 struct fuse_file_info *fi)
1387{
1388 __do_free char *cgroup = NULL, *line = NULL;
1389 __do_free void *fopen_cache = NULL;
1390 __do_fclose FILE *f = NULL;
1391 __do_close int fd = -EBADF;
1392 struct fuse_context *fc = fuse_get_context();
1393 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1394 size_t linelen = 0, total_len = 0;
1395 char *cache = d->buf;
1396 size_t cache_size = d->buflen;
1397 pid_t initpid;
1398
1399 if (offset) {
3cf1e562 1400 size_t left;
6cc153e6
FS
1401
1402 if (offset > d->size)
1403 return -EINVAL;
1404
1405 if (!d->cached)
1406 return 0;
1407
1408 left = d->size - offset;
1409 total_len = left > size ? size : left;
1410 memcpy(buf, cache + offset, total_len);
1411
1412 return total_len;
1413 }
1414
1415 initpid = lookup_initpid_in_store(fc->pid);
1416 if (initpid <= 1 || is_shared_pidns(initpid))
1417 initpid = fc->pid;
1418
1419 cgroup = get_pid_cgroup(initpid, "memory");
1420 if (!cgroup)
1421 return read_file_fuse("/proc/slabinfo", buf, size, d);
1422
1423 prune_init_slice(cgroup);
1424
1425 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1426 if (fd < 0)
1427 return read_file_fuse("/proc/slabinfo", buf, size, d);
1428
1429 f = fdopen_cached(fd, "re", &fopen_cache);
1430 if (!f)
1431 return read_file_fuse("/proc/slabinfo", buf, size, d);
1432
1433 while (getline(&line, &linelen, f) != -1) {
1434 ssize_t l = snprintf(cache, cache_size, "%s", line);
1435 if (l < 0)
1436 return log_error(0, "Failed to write cache");
3cf1e562 1437 if ((size_t)l >= cache_size)
6cc153e6
FS
1438 return log_error(0, "Write to cache was truncated");
1439
1440 cache += l;
1441 cache_size -= l;
1442 total_len += l;
1443 }
1444
1445 d->cached = 1;
1446 d->size = total_len;
1447 if (total_len > size)
1448 total_len = size;
1449 memcpy(buf, d->buf, total_len);
1450
1451 return total_len;
1452}
1453
2d7bcab7
CB
1454__lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1455 off_t offset, struct fuse_file_info *fi)
1f5596dd 1456{
99b183fb 1457 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
1458
1459 switch (f->type) {
1460 case LXC_TYPE_PROC_MEMINFO:
cbfc55fd
CB
1461 if (liblxcfs_functional())
1462 return proc_meminfo_read(buf, size, offset, fi);
1463
1464 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1465 buf, size, offset, f);
1f5596dd 1466 case LXC_TYPE_PROC_CPUINFO:
cbfc55fd
CB
1467 if (liblxcfs_functional())
1468 return proc_cpuinfo_read(buf, size, offset, fi);
1469
1470 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1471 buf, size, offset, f);
1f5596dd 1472 case LXC_TYPE_PROC_UPTIME:
cbfc55fd
CB
1473 if (liblxcfs_functional())
1474 return proc_uptime_read(buf, size, offset, fi);
1475
1476 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1477 buf, size, offset, f);
1f5596dd 1478 case LXC_TYPE_PROC_STAT:
cbfc55fd
CB
1479 if (liblxcfs_functional())
1480 return proc_stat_read(buf, size, offset, fi);
1481
1482 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1483 size, offset, f);
1f5596dd 1484 case LXC_TYPE_PROC_DISKSTATS:
cbfc55fd
CB
1485 if (liblxcfs_functional())
1486 return proc_diskstats_read(buf, size, offset, fi);
1487
1488 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1489 buf, size, offset, f);
1f5596dd 1490 case LXC_TYPE_PROC_SWAPS:
cbfc55fd
CB
1491 if (liblxcfs_functional())
1492 return proc_swaps_read(buf, size, offset, fi);
1493
1494 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1495 size, offset, f);
1f5596dd 1496 case LXC_TYPE_PROC_LOADAVG:
cbfc55fd
CB
1497 if (liblxcfs_functional())
1498 return proc_loadavg_read(buf, size, offset, fi);
1499
1500 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1501 buf, size, offset, f);
6cc153e6
FS
1502 case LXC_TYPE_PROC_SLABINFO:
1503 if (liblxcfs_functional())
1504 return proc_slabinfo_read(buf, size, offset, fi);
1505
1506 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1507 buf, size, offset, f);
1f5596dd 1508 }
99b183fb
CB
1509
1510 return -EINVAL;
1f5596dd 1511}