]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/proc_fuse.c
Merge pull request #526 from brauner/2022-03-13.fixes
[mirror_lxcfs.git] / src / proc_fuse.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
1f5596dd 2
f834b6bf
SP
3#include "config.h"
4
1f5596dd
CB
5#include <dirent.h>
6#include <errno.h>
7#include <fcntl.h>
1f5596dd
CB
8#include <inttypes.h>
9#include <libgen.h>
10#include <pthread.h>
11#include <sched.h>
12#include <stdarg.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <time.h>
19#include <unistd.h>
20#include <wait.h>
21#include <linux/magic.h>
22#include <linux/sched.h>
23#include <sys/epoll.h>
24#include <sys/mman.h>
25#include <sys/mount.h>
26#include <sys/param.h>
27#include <sys/socket.h>
28#include <sys/syscall.h>
29#include <sys/sysinfo.h>
30#include <sys/vfs.h>
31
e01afbb7
CB
32#include "proc_fuse.h"
33
1f5596dd 34#include "bindings.h"
1f5596dd
CB
35#include "cgroup_fuse.h"
36#include "cgroups/cgroup.h"
37#include "cgroups/cgroup_utils.h"
4ec5c9da 38#include "cpuset_parse.h"
ec2043ed 39#include "lxcfs_fuse_compat.h"
1f5596dd
CB
40#include "memory_utils.h"
41#include "proc_loadavg.h"
4ec5c9da 42#include "proc_cpuview.h"
1f5596dd
CB
43#include "utils.h"
44
acff9786
CB
/*
 * Collection of unsigned 64-bit memory counters.
 *
 * NOTE(review): the field names look like the hierarchical ("total_*")
 * entries of a cgroup memory.stat file - confirm against the parser that
 * fills this structure; it is not visible in this part of the file.
 */
struct memory_stat {
	/* Limits. */
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;

	/* Usage broken down by kind. */
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;

	/* Paging activity. */
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;

	/* LRU lists. */
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
66
ce554964
SP
67static off_t get_procfile_size(const char *path)
68{
69 __do_fclose FILE *f = NULL;
70 __do_free char *line = NULL;
71 size_t len = 0;
72 ssize_t sz, answer = 0;
73
74 f = fopen(path, "re");
75 if (!f)
76 return 0;
77
78 while ((sz = getline(&line, &len, f)) != -1)
79 answer += sz;
80
81 return answer;
82}
83
2d7bcab7 84__lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
1f5596dd
CB
85{
86 struct timespec now;
87
88 memset(sb, 0, sizeof(struct stat));
89 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
90 return -EINVAL;
f75d5b75 91
1f5596dd
CB
92 sb->st_uid = sb->st_gid = 0;
93 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
94 if (strcmp(path, "/proc") == 0) {
95 sb->st_mode = S_IFDIR | 00555;
96 sb->st_nlink = 2;
97 return 0;
98 }
f75d5b75
CB
99
100 if (strcmp(path, "/proc/meminfo") == 0 ||
101 strcmp(path, "/proc/cpuinfo") == 0 ||
102 strcmp(path, "/proc/uptime") == 0 ||
103 strcmp(path, "/proc/stat") == 0 ||
104 strcmp(path, "/proc/diskstats") == 0 ||
105 strcmp(path, "/proc/swaps") == 0 ||
6cc153e6
FS
106 strcmp(path, "/proc/loadavg") == 0 ||
107 strcmp(path, "/proc/slabinfo") == 0) {
ce554964 108 sb->st_size = get_procfile_size(path);
1f5596dd
CB
109 sb->st_mode = S_IFREG | 00444;
110 sb->st_nlink = 1;
111 return 0;
112 }
113
114 return -ENOENT;
115}
116
2d7bcab7
CB
117__lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
118 fuse_fill_dir_t filler, off_t offset,
119 struct fuse_file_info *fi)
1f5596dd 120{
5aff2eb2
CBM
121 if (dir_filler(filler, buf, ".", 0) != 0 ||
122 dir_filler(filler, buf, "..", 0) != 0 ||
123 dir_filler(filler, buf, "cpuinfo", 0) != 0 ||
124 dir_filler(filler, buf, "meminfo", 0) != 0 ||
125 dir_filler(filler, buf, "stat", 0) != 0 ||
126 dir_filler(filler, buf, "uptime", 0) != 0 ||
127 dir_filler(filler, buf, "diskstats", 0) != 0 ||
128 dir_filler(filler, buf, "swaps", 0) != 0 ||
129 dir_filler(filler, buf, "loadavg", 0) != 0 ||
130 dir_filler(filler, buf, "slabinfo", 0) != 0)
1f5596dd
CB
131 return -EINVAL;
132
133 return 0;
134}
135
2d7bcab7 136__lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
1f5596dd 137{
700dd417 138 __do_free struct file_info *info = NULL;
1f5596dd 139 int type = -1;
1f5596dd
CB
140
141 if (strcmp(path, "/proc/meminfo") == 0)
142 type = LXC_TYPE_PROC_MEMINFO;
143 else if (strcmp(path, "/proc/cpuinfo") == 0)
144 type = LXC_TYPE_PROC_CPUINFO;
145 else if (strcmp(path, "/proc/uptime") == 0)
146 type = LXC_TYPE_PROC_UPTIME;
147 else if (strcmp(path, "/proc/stat") == 0)
148 type = LXC_TYPE_PROC_STAT;
149 else if (strcmp(path, "/proc/diskstats") == 0)
150 type = LXC_TYPE_PROC_DISKSTATS;
151 else if (strcmp(path, "/proc/swaps") == 0)
152 type = LXC_TYPE_PROC_SWAPS;
153 else if (strcmp(path, "/proc/loadavg") == 0)
154 type = LXC_TYPE_PROC_LOADAVG;
6cc153e6
FS
155 else if (strcmp(path, "/proc/slabinfo") == 0)
156 type = LXC_TYPE_PROC_SLABINFO;
1f5596dd
CB
157 if (type == -1)
158 return -ENOENT;
159
f1a33645 160 info = zalloc(sizeof(*info));
1f5596dd
CB
161 if (!info)
162 return -ENOMEM;
163
1f5596dd
CB
164 info->type = type;
165
166 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
167
f1a33645 168 info->buf = zalloc(info->buflen);
1f5596dd
CB
169 if (!info->buf)
170 return -ENOMEM;
1f5596dd
CB
171 /* set actual size to buffer size */
172 info->size = info->buflen;
173
700dd417 174 fi->fh = PTR_TO_UINT64(move_ptr(info));
1f5596dd
CB
175 return 0;
176}
177
2d7bcab7 178__lxcfs_fuse_ops int proc_access(const char *path, int mask)
1f5596dd
CB
179{
180 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
181 return 0;
182
183 /* these are all read-only */
184 if ((mask & ~R_OK) != 0)
185 return -EACCES;
f75d5b75 186
1f5596dd
CB
187 return 0;
188}
189
2d7bcab7 190__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
1f5596dd
CB
191{
192 do_release_file_info(fi);
193 return 0;
194}
195
e9712933 196static uint64_t get_memlimit(const char *cgroup, bool swap)
1f5596dd 197{
1f5596dd 198 __do_free char *memlimit_str = NULL;
c83158f2 199 uint64_t memlimit = 0;
f75d5b75 200 int ret;
1f5596dd
CB
201
202 if (swap)
203 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
204 else
205 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
ee1a885f 206 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
1992babf
CBM
207 lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
208 swap ? ".swap" : "", memlimit_str, cgroup);
1f5596dd
CB
209
210 return memlimit;
211}
212
6f88ab0c
JS
/*
 * dirname() with GNU semantics (logic taken from glibc-2.32). POSIX
 * dirname("/some-dir") will return "/some-dir" as opposed to "/", which
 * breaks `get_min_memlimit()`.
 *
 * @path is modified in place: it is cut off at the end of its directory
 * part and returned. When @path is NULL or has no directory part a pointer to
 * a static "." is returned instead and @path is left untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *slash, *head;

	slash = path ? strrchr(path, '/') : NULL;

	/* The last '/' terminates the string: look for the one before it. */
	if (slash && slash != path && slash[1] == '\0') {
		/* Step over the whole run of trailing slashes. */
		head = slash;
		while (head != path && head[-1] == '/')
			head--;

		if (head != path) {
			/* Find the last '/' in [path, head), if there is one. */
			char *scan;

			slash = NULL;
			for (scan = head; scan != path; scan--) {
				if (scan[-1] == '/') {
					slash = scan - 1;
					break;
				}
			}
		}
	}

	/*
	 * No directory part: XPG requires "." here, which is why a static
	 * constant string is returned through a non-const pointer.
	 */
	if (!slash)
		return (char *)dot;

	/* Collapse the run of slashes that ends at the separator. */
	head = slash;
	while (head != path && head[-1] == '/')
		head--;

	if (head != path) {
		slash = head;
	} else if (slash == path + 1) {
		/*
		 * Exactly two leading slashes: "//" has to be kept as is.
		 * See XBD 4.10 Path Name Resolution.
		 */
		slash++;
	} else {
		/* The separator is the leading slash: the result is "/". */
		slash = path + 1;
	}

	/* Terminate the path. */
	*slash = '\0';

	return path;
}
274
e9712933 275static uint64_t get_min_memlimit(const char *cgroup, bool swap)
1f5596dd
CB
276{
277 __do_free char *copy = NULL;
c83158f2 278 uint64_t memlimit = 0, retlimit = 0;
1f5596dd
CB
279
280 copy = strdup(cgroup);
f75d5b75
CB
281 if (!copy)
282 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
283
1f5596dd
CB
284 retlimit = get_memlimit(copy, swap);
285
6f88ab0c
JS
286 /*
287 * If the cgroup doesn't start with / (probably won't happen), dirname()
288 * will terminate with "" instead of "/"
289 */
290 while (*copy && strcmp(copy, "/") != 0) {
1f5596dd
CB
291 char *it = copy;
292
6f88ab0c 293 it = gnu_dirname(it);
1f5596dd 294 memlimit = get_memlimit(it, swap);
c83158f2 295 if (memlimit > 0 && memlimit < retlimit)
1f5596dd
CB
296 retlimit = memlimit;
297 };
298
299 return retlimit;
300}
301
/* Return true if @line begins with @pref; an empty @pref always matches. */
static inline bool startswith(const char *line, const char *pref)
{
	while (*pref != '\0') {
		if (*line != *pref)
			return false;
		line++;
		pref++;
	}

	return true;
}
306
50f7faee
WB
307static void get_swap_info(const char *cgroup, uint64_t memlimit,
308 uint64_t memusage, uint64_t *swtotal,
309 uint64_t *swusage, uint64_t *memswpriority)
310{
311 __do_free char *memswusage_str = NULL, *memswpriority_str = NULL;
cd255e04 312 uint64_t memswlimit = 0, memswusage = 0;
50f7faee
WB
313 int ret;
314
315 *swtotal = *swusage = 0;
316 *memswpriority = 1;
317
318 memswlimit = get_min_memlimit(cgroup, true);
319 if (memswlimit > 0) {
320 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
321 if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
322 return;
323
324 if (liblxcfs_memory_is_cgroupv2()) {
325 *swtotal = memswlimit / 1024;
326 *swusage = memswusage / 1024;
327 } else {
328 if (memlimit > memswlimit)
329 *swtotal = 0;
330 else
331 *swtotal = (memswlimit - memlimit) / 1024;
332 if (memusage > memswusage || swtotal == 0)
333 *swusage = 0;
334 else
335 *swusage = (memswusage - memusage) / 1024;
336 }
337
338 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
339 if (ret >= 0)
340 safe_uint64(memswpriority_str, memswpriority, 10);
341 }
342}
343
1f5596dd
CB
344static int proc_swaps_read(char *buf, size_t size, off_t offset,
345 struct fuse_file_info *fi)
346{
362d1193
SG
347 __do_free char *cgroup = NULL, *memusage_str = NULL,
348 *memswusage_str = NULL, *memswpriority_str = NULL;
1f5596dd 349 struct fuse_context *fc = fuse_get_context();
84e184b1 350 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
99b183fb 351 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
50f7faee 352 uint64_t memlimit = 0, memusage = 0,
79af0cd1 353 swtotal = 0, swusage = 0, memswpriority = 1,
3ce66074 354 hostswtotal = 0, hostswfree = 0;
1f5596dd
CB
355 ssize_t total_len = 0;
356 ssize_t l = 0;
357 char *cache = d->buf;
358 int ret;
3ce66074
SG
359 __do_free char *line = NULL;
360 __do_free void *fopen_cache = NULL;
361 __do_fclose FILE *f = NULL;
362 size_t linelen = 0;
1f5596dd
CB
363
364 if (offset) {
3cf1e562 365 size_t left;
1f5596dd
CB
366
367 if (offset > d->size)
368 return -EINVAL;
369
370 if (!d->cached)
371 return 0;
372
373 left = d->size - offset;
374 total_len = left > size ? size: left;
375 memcpy(buf, cache + offset, total_len);
376
377 return total_len;
378 }
379
380 pid_t initpid = lookup_initpid_in_store(fc->pid);
381 if (initpid <= 1 || is_shared_pidns(initpid))
382 initpid = fc->pid;
a9f0d623 383
b7b018d0
CB
384 cgroup = get_pid_cgroup(initpid, "memory");
385 if (!cgroup)
1f5596dd 386 return read_file_fuse("/proc/swaps", buf, size, d);
b7b018d0 387 prune_init_slice(cgroup);
1f5596dd 388
b7b018d0 389 memlimit = get_min_memlimit(cgroup, false);
1f5596dd 390
b7b018d0 391 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1f5596dd
CB
392 if (ret < 0)
393 return 0;
394
c83158f2
CB
395 if (safe_uint64(memusage_str, &memusage, 10) < 0)
396 lxcfs_error("Failed to convert memusage %s", memusage_str);
1f5596dd 397
50f7faee
WB
398 if (wants_swap)
399 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1f5596dd
CB
400
401 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
402
3ce66074
SG
403 /* Read host total and free values */
404 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
405 if (!f)
406 return 0;
1f5596dd 407
3ce66074
SG
408 while (getline(&line, &linelen, f) != -1) {
409 if (startswith(line, "SwapTotal:"))
410 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
411 else if (startswith(line, "SwapFree:"))
412 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
413 }
1f5596dd 414
3ce66074
SG
415 if (wants_swap) {
416 /* The total amount of swap is always reported to be the
417 lesser of the RAM+SWAP limit or the SWAP device size.
418 This is because the kernel can swap as much as it
419 wants and not only up to swtotal. */
420 swtotal = memlimit / 1024 + swtotal;
421 if (hostswtotal < swtotal) {
422 swtotal = hostswtotal;
1f5596dd 423 }
1f5596dd 424
3ce66074
SG
425 /* When swappiness is 0, pretend we can't swap. */
426 if (memswpriority == 0) {
427 swtotal = swusage;
428 }
362d1193
SG
429 }
430
b7b018d0 431 if (swtotal > 0) {
1f5596dd 432 l = snprintf(d->buf + total_len, d->size - total_len,
e9712933 433 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
7cbfbc74 434 36, " ", swtotal, swusage);
1f5596dd
CB
435 total_len += l;
436 }
437
f75d5b75
CB
438 if (total_len < 0 || l < 0)
439 return log_error(0, "Failed writing to cache");
1f5596dd
CB
440
441 d->cached = 1;
442 d->size = (int)total_len;
443
3cf1e562 444 if ((size_t)total_len > size)
f75d5b75 445 total_len = size;
1f5596dd 446 memcpy(buf, d->buf, total_len);
f75d5b75 447
1f5596dd
CB
448 return total_len;
449}
450
/*
 * Look up one counter in the text of a blkio statistics file.
 *
 * @str    newline separated "<major>:<minor> <iotype> <value>" records;
 *         may be NULL when the statistics could not be read
 * @major  device major number to look for
 * @minor  device minor number to look for
 * @iotype operation name to look for, e.g. "Read"
 * @v      out: the value of the first matching record, 0 if there is none
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32] = { 0 };
	size_t len;

	*v = 0;

	/* Nothing was read from the cgroup: report 0 instead of crashing. */
	if (!str)
		return;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	len = strlen(key);

	while (*str) {
		char *eol;

		/* Records are only matched at the start of a line. */
		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%" PRIu64, v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;

		str = eol + 1;
	}
}
474
998cdfc9
CB
/*
 * One record of disk statistics. The number in each comment is the position
 * of the field within a line of the statistics file.
 */
struct lxcfs_diskstats {
	/* 1 - major number */
	unsigned int major;
	/* 2 - minor number */
	unsigned int minor;
	/* 3 - device name */
	char dev_name[72];
	/* 4 - reads completed successfully */
	uint64_t read;
	/* 5 - reads merged */
	uint64_t read_merged;
	/* 6 - sectors read */
	uint64_t read_sectors;
	/* 7 - time spent reading (ms) */
	uint64_t read_ticks;
	/* 8 - writes completed */
	uint64_t write;
	/* 9 - writes merged */
	uint64_t write_merged;
	/* 10 - sectors written */
	uint64_t write_sectors;
	/* 11 - time spent writing (ms) */
	uint64_t write_ticks;
	/* 12 - I/Os currently in progress */
	uint64_t ios_pgr;
	/* 13 - time spent doing I/Os (ms) */
	uint64_t total_ticks;
	/* 14 - weighted time spent doing I/Os (ms) */
	uint64_t rq_ticks;
	/* 15 - discards completed successfully (4.18+) */
	uint64_t discard;
	/* 16 - discards merged (4.18+) */
	uint64_t discard_merged;
	/* 17 - sectors discarded (4.18+) */
	uint64_t discard_sectors;
	/* 18 - time spent discarding (4.18+) */
	uint64_t discard_ticks;
};
495
1f5596dd
CB
496static int proc_diskstats_read(char *buf, size_t size, off_t offset,
497 struct fuse_file_info *fi)
498{
499 __do_free char *cg = NULL, *io_serviced_str = NULL,
500 *io_merged_str = NULL, *io_service_bytes_str = NULL,
501 *io_wait_time_str = NULL, *io_service_time_str = NULL,
502 *line = NULL;
757a63e7 503 __do_free void *fopen_cache = NULL;
1f5596dd
CB
504 __do_fclose FILE *f = NULL;
505 struct fuse_context *fc = fuse_get_context();
99b183fb 506 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
998cdfc9
CB
507 struct lxcfs_diskstats stats = {};
508 /* helper fields */
509 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
510 write_wait_time, discard_wait_time;
1f5596dd
CB
511 char *cache = d->buf;
512 size_t cache_size = d->buflen;
513 size_t linelen = 0, total_len = 0;
1f5596dd
CB
514 int i = 0;
515 int ret;
1f5596dd 516
cbfc55fd 517 if (offset) {
3cf1e562 518 size_t left;
1f5596dd
CB
519
520 if (offset > d->size)
521 return -EINVAL;
522
523 if (!d->cached)
524 return 0;
525
526 left = d->size - offset;
527 total_len = left > size ? size: left;
528 memcpy(buf, cache + offset, total_len);
529
530 return total_len;
531 }
532
533 pid_t initpid = lookup_initpid_in_store(fc->pid);
534 if (initpid <= 1 || is_shared_pidns(initpid))
535 initpid = fc->pid;
a9f0d623 536
1f5596dd
CB
537 cg = get_pid_cgroup(initpid, "blkio");
538 if (!cg)
539 return read_file_fuse("/proc/diskstats", buf, size, d);
540 prune_init_slice(cg);
541
542 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
543 if (ret < 0) {
544 if (ret == -EOPNOTSUPP)
545 return read_file_fuse("/proc/diskstats", buf, size, d);
546 }
547
548 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
549 if (ret < 0) {
550 if (ret == -EOPNOTSUPP)
551 return read_file_fuse("/proc/diskstats", buf, size, d);
552 }
553
554 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
555 if (ret < 0) {
556 if (ret == -EOPNOTSUPP)
557 return read_file_fuse("/proc/diskstats", buf, size, d);
558 }
559
560 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
561 if (ret < 0) {
562 if (ret == -EOPNOTSUPP)
563 return read_file_fuse("/proc/diskstats", buf, size, d);
564 }
565
566 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
567 if (ret < 0) {
568 if (ret == -EOPNOTSUPP)
569 return read_file_fuse("/proc/diskstats", buf, size, d);
570 }
571
757a63e7 572 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
1f5596dd
CB
573 if (!f)
574 return 0;
575
576 while (getline(&line, &linelen, f) != -1) {
577 ssize_t l;
578 char lbuf[256];
579
998cdfc9 580 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
1f5596dd
CB
581 if (i != 3)
582 continue;
583
998cdfc9
CB
584 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
585 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
586 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
587
588 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
589 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
590 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
591
592 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
593 stats.read_sectors = stats.read_sectors / 512;
594 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
595 stats.write_sectors = stats.write_sectors / 512;
596 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
597 stats.discard_sectors = stats.discard_sectors / 512;
598
599 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
600 read_service_time = read_service_time / 1000000;
601 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
602 read_wait_time = read_wait_time / 1000000;
603 stats.read_ticks = read_service_time + read_wait_time;
604
605 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
606 write_service_time = write_service_time / 1000000;
607 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
608 write_wait_time = write_wait_time / 1000000;
609 stats.write_ticks = write_service_time + write_wait_time;
610
611 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
612 discard_service_time = discard_service_time / 1000000;
613 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
614 discard_wait_time = discard_wait_time / 1000000;
615 stats.discard_ticks = discard_service_time + discard_wait_time;
616
617 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
618 stats.total_ticks = stats.total_ticks / 1000000;
1f5596dd
CB
619
620 memset(lbuf, 0, 256);
998cdfc9
CB
621 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
622 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
623 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks ||
624 stats.discard_merged || stats.discard_sectors || stats.discard_ticks)
2c559f0b
WB
625 snprintf(
626 lbuf,
627 256,
628 "%u %u" /* major, minor */
629 " %s" /* dev_name */
630 " %" PRIu64 /* read */
631 " %" PRIu64 /* read_merged */
632 " %" PRIu64 /* read_sectors */
633 " %" PRIu64 /* read_ticks */
634 " %" PRIu64 /* write */
635 " %" PRIu64 /* write_merged */
636 " %" PRIu64 /* write_sectors */
637 " %" PRIu64 /* write_ticks */
638 " %" PRIu64 /* ios_pgr */
639 " %" PRIu64 /* total_ticks */
640 " %" PRIu64 /* rq_ticks */
641 " %" PRIu64 /* discard_merged */
642 " %" PRIu64 /* discard_sectors */
643 " %" PRIu64 /* discard_ticks */
644 "\n",
998cdfc9
CB
645 stats.major,
646 stats.minor,
647 stats.dev_name,
648 stats.read,
649 stats.read_merged,
650 stats.read_sectors,
651 stats.read_ticks,
652 stats.write,
653 stats.write_merged,
654 stats.write_sectors,
655 stats.write_ticks,
656 stats.ios_pgr,
657 stats.total_ticks,
658 stats.rq_ticks,
659 stats.discard_merged,
660 stats.discard_sectors,
661 stats.discard_ticks);
1f5596dd
CB
662 else
663 continue;
664
665 l = snprintf(cache, cache_size, "%s", lbuf);
f75d5b75
CB
666 if (l < 0)
667 return log_error(0, "Failed to write cache");
3cf1e562 668 if ((size_t)l >= cache_size)
f75d5b75
CB
669 return log_error(0, "Write to cache was truncated");
670
1f5596dd
CB
671 cache += l;
672 cache_size -= l;
673 total_len += l;
674 }
675
676 d->cached = 1;
677 d->size = total_len;
cbfc55fd
CB
678 if (total_len > size)
679 total_len = size;
1f5596dd
CB
680 memcpy(buf, d->buf, total_len);
681
682 return total_len;
683}
684
#ifdef RELOADTEST
/*
 * Leave a marker file behind so that a test can tell this code ran.
 *
 * mknod() expects the file type and the permission bits together in its
 * second argument; the third argument is a device number and is unused for
 * regular files. The result is deliberately ignored: the marker may already
 * exist.
 */
static inline void iwashere(void)
{
	(void)mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
691
c9c93749
CB
692/*
693 * This function retrieves the busy time of a group of tasks by looking at
1f5596dd
CB
694 * cpuacct.usage. Unfortunately, this only makes sense when the container has
695 * been given it's own cpuacct cgroup. If not, this function will take the busy
696 * time of all other taks that do not actually belong to the container into
697 * account as well. If someone has a clever solution for this please send a
698 * patch!
699 */
700static double get_reaper_busy(pid_t task)
701{
702 __do_free char *cgroup = NULL, *usage_str = NULL;
e9712933 703 uint64_t usage = 0;
1f5596dd
CB
704 pid_t initpid;
705
706 initpid = lookup_initpid_in_store(task);
707 if (initpid <= 0)
708 return 0;
709
710 cgroup = get_pid_cgroup(initpid, "cpuacct");
711 if (!cgroup)
712 return 0;
713 prune_init_slice(cgroup);
c9c93749 714
e9712933 715 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
1f5596dd
CB
716 return 0;
717
c83158f2
CB
718 if (safe_uint64(usage_str, &usage, 10) < 0)
719 lxcfs_error("Failed to convert usage %s", usage_str);
720
1f5596dd
CB
721 return ((double)usage / 1000000000);
722}
723
724static uint64_t get_reaper_start_time(pid_t pid)
725{
757a63e7 726 __do_free void *fopen_cache = NULL;
12a60884 727 __do_fclose FILE *f = NULL;
1f5596dd 728 int ret;
1f5596dd 729 uint64_t starttime;
c9c93749
CB
730 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
731 STRLITERALLEN("/stat") + 1];
1f5596dd
CB
732 pid_t qpid;
733
734 qpid = lookup_initpid_in_store(pid);
c9c93749
CB
735 if (qpid <= 0)
736 return ret_errno(EINVAL);
1f5596dd 737
c9c93749
CB
738 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
739 if (ret < 0 || (size_t)ret >= sizeof(path))
740 return ret_errno(EINVAL);
1f5596dd 741
757a63e7 742 f = fopen_cached(path, "re", &fopen_cache);
c9c93749
CB
743 if (!f)
744 return ret_errno(EINVAL);
1f5596dd
CB
745
746 /* Note that the *scanf() argument supression requires that length
747 * modifiers such as "l" are omitted. Otherwise some compilers will yell
748 * at us. It's like telling someone you're not married and then asking
749 * if you can bring your wife to the party.
750 */
751 ret = fscanf(f, "%*d " /* (1) pid %d */
752 "%*s " /* (2) comm %s */
753 "%*c " /* (3) state %c */
754 "%*d " /* (4) ppid %d */
755 "%*d " /* (5) pgrp %d */
756 "%*d " /* (6) session %d */
757 "%*d " /* (7) tty_nr %d */
758 "%*d " /* (8) tpgid %d */
759 "%*u " /* (9) flags %u */
760 "%*u " /* (10) minflt %lu */
761 "%*u " /* (11) cminflt %lu */
762 "%*u " /* (12) majflt %lu */
763 "%*u " /* (13) cmajflt %lu */
764 "%*u " /* (14) utime %lu */
765 "%*u " /* (15) stime %lu */
766 "%*d " /* (16) cutime %ld */
767 "%*d " /* (17) cstime %ld */
768 "%*d " /* (18) priority %ld */
769 "%*d " /* (19) nice %ld */
770 "%*d " /* (20) num_threads %ld */
771 "%*d " /* (21) itrealvalue %ld */
772 "%" PRIu64, /* (22) starttime %llu */
773 &starttime);
12a60884 774 if (ret != 1)
c9c93749 775 return ret_errno(EINVAL);
1f5596dd 776
12a60884 777 return ret_set_errno(starttime, 0);
1f5596dd
CB
778}
779
/*
 * Return the start time of the reaper of @pid in seconds, i.e. the value of
 * get_reaper_start_time() divided by the number of clock ticks per second.
 *
 * Returns 0 if the start time or the tick rate cannot be determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	/*
	 * The value is unsigned, so a negative error code handed back by the
	 * callee shows up as a huge number. Look at it as a signed value to
	 * catch both 0 and such wrapped error codes.
	 */
	if ((int64_t)clockticks <= 0)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* 0 is rejected as well: it is used as a divisor below. */
	if (ticks_per_sec <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (uint64_t)ticks_per_sec;
}
798
799static double get_reaper_age(pid_t pid)
800{
801 uint64_t uptime_ms;
802 double procstart, procage;
803
c9c93749
CB
804 /*
805 * We need to substract the time the process has started since system
1f5596dd
CB
806 * boot minus the time when the system has started to get the actual
807 * reaper age.
808 */
809 procstart = get_reaper_start_time_in_sec(pid);
810 procage = procstart;
811 if (procstart > 0) {
812 int ret;
813 struct timespec spec;
814
815 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
816 if (ret < 0)
817 return 0;
818
1f5596dd
CB
819 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
820 procage = (uptime_ms - (procstart * 1000)) / 1000;
821 }
822
823 return procage;
824}
825
826/*
827 * We read /proc/uptime and reuse its second field.
828 * For the first field, we use the mtime for the reaper for
829 * the calling pid as returned by getreaperage
830 */
831static int proc_uptime_read(char *buf, size_t size, off_t offset,
832 struct fuse_file_info *fi)
833{
834 struct fuse_context *fc = fuse_get_context();
99b183fb 835 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd 836 char *cache = d->buf;
d7718002 837 ssize_t total_len = 0, ret = 0;
ea725aba 838 double busytime, idletime, reaperage;
1f5596dd 839
b1ef0dde 840#ifdef RELOADTEST
1f5596dd
CB
841 iwashere();
842#endif
843
f75d5b75 844 if (offset) {
3cf1e562 845 size_t left;
f75d5b75 846
1f5596dd
CB
847 if (offset > d->size)
848 return -EINVAL;
f75d5b75 849
d7718002
CB
850 if (!d->cached)
851 return 0;
852
f75d5b75
CB
853 left = d->size - offset;
854 total_len = left > size ? size : left;
1f5596dd 855 memcpy(buf, cache + offset, total_len);
f75d5b75 856
1f5596dd
CB
857 return total_len;
858 }
859
860 reaperage = get_reaper_age(fc->pid);
f75d5b75
CB
861 /*
862 * To understand why this is done, please read the comment to the
1f5596dd
CB
863 * get_reaper_busy() function.
864 */
865 idletime = reaperage;
ea725aba 866 busytime = get_reaper_busy(fc->pid);
1f5596dd
CB
867 if (reaperage >= busytime)
868 idletime = reaperage - busytime;
869
d7718002
CB
870 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
871 if (ret < 0 || ret >= d->buflen)
c9c93749 872 return read_file_fuse("/proc/uptime", buf, size, d);
d7718002 873 total_len = ret;
1f5596dd 874
1f5596dd 875 d->cached = 1;
d7718002 876 d->size = total_len;
3cf1e562 877 if ((size_t)total_len > size)
f75d5b75 878 total_len = size;
1f5596dd 879 memcpy(buf, d->buf, total_len);
c9c93749 880
1f5596dd
CB
881 return total_len;
882}
883
884#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
885static int proc_stat_read(char *buf, size_t size, off_t offset,
886 struct fuse_file_info *fi)
887{
888 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
757a63e7 889 __do_free void *fopen_cache = NULL;
1f5596dd
CB
890 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
891 __do_fclose FILE *f = NULL;
892 struct fuse_context *fc = fuse_get_context();
8044f626 893 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
99b183fb 894 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
895 size_t linelen = 0, total_len = 0;
896 int curcpu = -1; /* cpu numbering starts at 0 */
897 int physcpu = 0;
1ba088ae
CB
898 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
899 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
900 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
901 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
902 guest_sum = 0, guest_nice_sum = 0;
1f5596dd
CB
903 char cpuall[CPUALL_MAX_SIZE];
904 /* reserve for cpu all */
905 char *cache = d->buf + CPUALL_MAX_SIZE;
906 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
907 int cg_cpu_usage_size = 0;
908
7b367480 909 if (offset) {
3cf1e562 910 size_t left;
7b367480 911
1f5596dd
CB
912 if (offset > d->size)
913 return -EINVAL;
7b367480 914
1f5596dd
CB
915 if (!d->cached)
916 return 0;
7b367480
CB
917
918 left = d->size - offset;
919 total_len = left > size ? size : left;
1f5596dd 920 memcpy(buf, d->buf + offset, total_len);
7b367480 921
1f5596dd
CB
922 return total_len;
923 }
924
925 pid_t initpid = lookup_initpid_in_store(fc->pid);
a9f0d623 926 if (initpid <= 1 || is_shared_pidns(initpid))
1f5596dd
CB
927 initpid = fc->pid;
928
929 /*
930 * when container run with host pid namespace initpid == 1, cgroup will "/"
931 * we should return host os's /proc contents.
932 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
933 */
605e157d 934 if (initpid == 1)
6300e6c6 935 return read_file_fuse("/proc/stat", buf, size, d);
1f5596dd
CB
936
937 cg = get_pid_cgroup(initpid, "cpuset");
1f5596dd
CB
938 if (!cg)
939 return read_file_fuse("/proc/stat", buf, size, d);
940 prune_init_slice(cg);
941
942 cpuset = get_cpuset(cg);
943 if (!cpuset)
944 return 0;
945
77711d7a
CB
946 f = fopen_cached("/proc/stat", "re", &fopen_cache);
947 if (!f)
948 return 0;
949
1494771e
CB
950 /* Skip first system cpu line. */
951 if (getline(&line, &linelen, f) < 0)
952 return log_error(0, "proc_stat_read read first line failed");
953
1f5596dd
CB
954 /*
955 * Read cpuacct.usage_all for all CPUs.
956 * If the cpuacct cgroup is present, it is used to calculate the container's
957 * CPU usage. If not, values from the host's /proc/stat are used.
958 */
f9434b9a
CB
959 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
960 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
961 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage,
962 cg_cpu_usage_size, f,
963 d->buf, d->buflen);
964 goto out;
965 }
966 } else {
967 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
968 }
1f5596dd 969
1f5596dd
CB
970 while (getline(&line, &linelen, f) != -1) {
971 ssize_t l;
972 char cpu_char[10]; /* That's a lot of cores */
973 char *c;
974 uint64_t all_used, cg_used, new_idle;
975 int ret;
976
977 if (strlen(line) == 0)
978 continue;
979 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
980 /* not a ^cpuN line containing a number N, just print it */
981 l = snprintf(cache, cache_size, "%s", line);
f75d5b75
CB
982 if (l < 0)
983 return log_error(0, "Failed to write cache");
3cf1e562 984 if ((size_t)l >= cache_size)
f75d5b75
CB
985 return log_error(0, "Write to cache was truncated");
986
1f5596dd
CB
987 cache += l;
988 cache_size -= l;
989 total_len += l;
f75d5b75 990
1f5596dd
CB
991 continue;
992 }
993
994 if (sscanf(cpu_char, "%d", &physcpu) != 1)
995 continue;
f75d5b75 996
1f5596dd
CB
997 if (!cpu_in_cpuset(physcpu, cpuset))
998 continue;
f75d5b75 999
2b8eff1d 1000 curcpu++;
1f5596dd 1001
2c559f0b
WB
1002 ret = sscanf(
1003 line,
1004 "%*s" /* <skip> */
1005 " %" PRIu64 /* user */
1006 " %" PRIu64 /* nice */
1007 " %" PRIu64 /* system */
1008 " %" PRIu64 /* idle */
1009 " %" PRIu64 /* iowait */
1010 " %" PRIu64 /* irq */
1011 " %" PRIu64 /* softirq */
1012 " %" PRIu64 /* steal */
1013 " %" PRIu64 /* guest */
1014 " %" PRIu64, /* guest_nice */
1f5596dd
CB
1015 &user,
1016 &nice,
1017 &system,
1018 &idle,
1019 &iowait,
1020 &irq,
1021 &softirq,
1022 &steal,
1023 &guest,
1024 &guest_nice);
1f5596dd
CB
1025 if (ret != 10 || !cg_cpu_usage) {
1026 c = strchr(line, ' ');
1027 if (!c)
1028 continue;
1f5596dd 1029
f75d5b75
CB
1030 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
1031 if (l < 0)
1032 return log_error(0, "Failed to write cache");
3cf1e562 1033 if ((size_t)l >= cache_size)
f75d5b75 1034 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
1035
1036 cache += l;
1037 cache_size -= l;
1038 total_len += l;
1039
1040 if (ret != 10)
1041 continue;
1042 }
1043
1044 if (cg_cpu_usage) {
1045 if (physcpu >= cg_cpu_usage_size)
1046 break;
1047
1048 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1049 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1050
1051 if (all_used >= cg_used) {
1052 new_idle = idle + (all_used - cg_used);
1f5596dd 1053 } else {
1e3aa115 1054 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
2b8eff1d 1055 curcpu, cg, all_used, cg_used);
1f5596dd
CB
1056 new_idle = idle;
1057 }
1058
2b8eff1d
CB
1059 l = snprintf(cache, cache_size,
1060 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1061 curcpu, cg_cpu_usage[physcpu].user,
1062 cg_cpu_usage[physcpu].system, new_idle);
f75d5b75
CB
1063 if (l < 0)
1064 return log_error(0, "Failed to write cache");
3cf1e562 1065 if ((size_t)l >= cache_size)
f75d5b75 1066 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
1067
1068 cache += l;
1069 cache_size -= l;
1070 total_len += l;
1071
1072 user_sum += cg_cpu_usage[physcpu].user;
1073 system_sum += cg_cpu_usage[physcpu].system;
1074 idle_sum += new_idle;
1f5596dd
CB
1075 } else {
1076 user_sum += user;
1077 nice_sum += nice;
1078 system_sum += system;
1079 idle_sum += idle;
1080 iowait_sum += iowait;
1081 irq_sum += irq;
1082 softirq_sum += softirq;
1083 steal_sum += steal;
1084 guest_sum += guest;
1085 guest_nice_sum += guest_nice;
1086 }
1087 }
1088
1089 cache = d->buf;
1090
2c559f0b
WB
1091 int cpuall_len = snprintf(
1092 cpuall,
1093 CPUALL_MAX_SIZE,
1094 "cpu "
1095 " %" PRIu64 /* user_sum */
1096 " %" PRIu64 /* nice_sum */
1097 " %" PRIu64 /* system_sum */
1098 " %" PRIu64 /* idle_sum */
1099 " %" PRIu64 /* iowait_sum */
1100 " %" PRIu64 /* irq_sum */
1101 " %" PRIu64 /* softirq_sum */
1102 " %" PRIu64 /* steal_sum */
1103 " %" PRIu64 /* guest_sum */
1104 " %" PRIu64 /* guest_nice_sum */
1105 "\n",
1f5596dd
CB
1106 user_sum,
1107 nice_sum,
1108 system_sum,
1109 idle_sum,
1110 iowait_sum,
1111 irq_sum,
1112 softirq_sum,
1113 steal_sum,
1114 guest_sum,
1115 guest_nice_sum);
1116 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1117 memcpy(cache, cpuall, cpuall_len);
1118 cache += cpuall_len;
1119 } else {
1120 /* shouldn't happen */
f75d5b75 1121 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1f5596dd
CB
1122 cpuall_len = 0;
1123 }
1124
1125 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1126 total_len += cpuall_len;
1127
1128out:
1129 d->cached = 1;
1130 d->size = total_len;
1131 if (total_len > size)
1132 total_len = size;
1133
1134 memcpy(buf, d->buf, total_len);
1135 return total_len;
1136}
1137
1138/* Note that "memory.stat" in cgroup2 is hierarchical by default. */
acff9786 1139static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1f5596dd 1140{
05b7a16d 1141 __do_close int fd = -EBADF;
acff9786
CB
1142 __do_fclose FILE *f = NULL;
1143 __do_free char *line = NULL;
28519477 1144 __do_free void *fdopen_cache = NULL;
acff9786
CB
1145 bool unified;
1146 size_t len = 0;
1147 ssize_t linelen;
1f5596dd 1148
acff9786
CB
1149 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1150 if (fd < 0)
1151 return false;
1152
28519477 1153 f = fdopen_cached(fd, "re", &fdopen_cache);
acff9786
CB
1154 if (!f)
1155 return false;
acff9786
CB
1156
1157 unified = pure_unified_layout(cgroup_ops);
1158 while ((linelen = getline(&line, &len, f)) != -1) {
1159 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1160 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1161 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1162 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
91d63a9e
IM
1163 } else if (startswith(line, unified ? "file" :"total_cache")) {
1164 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
acff9786
CB
1165 } else if (!unified && startswith(line, "total_rss")) {
1166 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1167 } else if (!unified && startswith(line, "total_rss_huge")) {
1168 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1169 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1170 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1171 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1172 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1173 } else if (!unified && startswith(line, "total_dirty")) {
1174 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1175 } else if (!unified && startswith(line, "total_writeback")) {
1176 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1177 } else if (!unified && startswith(line, "total_swap")) {
1178 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1179 } else if (!unified && startswith(line, "total_pgpgin")) {
1180 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1181 } else if (!unified && startswith(line, "total_pgpgout")) {
1182 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1183 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1184 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1185 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1186 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1187 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1188 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1189 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1190 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1191 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1192 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1193 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1194 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1195 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1196 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1f5596dd 1197 }
1f5596dd 1198 }
acff9786
CB
1199
1200 return true;
1f5596dd
CB
1201}
1202
1f5596dd
CB
1203static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1204 struct fuse_file_info *fi)
1205{
63f35cc0 1206 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
362d1193 1207 *memswusage_str = NULL, *memswpriority_str = NULL;
757a63e7 1208 __do_free void *fopen_cache = NULL;
1f5596dd
CB
1209 __do_fclose FILE *f = NULL;
1210 struct fuse_context *fc = fuse_get_context();
84e184b1 1211 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
99b183fb 1212 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
50f7faee 1213 uint64_t memlimit = 0, memusage = 0,
362d1193
SG
1214 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1215 memswpriority = 1;
334a14f9 1216 struct memory_stat mstat = {};
1f5596dd
CB
1217 size_t linelen = 0, total_len = 0;
1218 char *cache = d->buf;
1219 size_t cache_size = d->buflen;
1220 int ret;
1221
1222 if (offset) {
3cf1e562 1223 size_t left;
1f5596dd
CB
1224
1225 if (offset > d->size)
1226 return -EINVAL;
1227
1228 if (!d->cached)
1229 return 0;
1230
1231 left = d->size - offset;
1232 total_len = left > size ? size : left;
1233 memcpy(buf, cache + offset, total_len);
1234
1235 return total_len;
1236 }
1237
1238 pid_t initpid = lookup_initpid_in_store(fc->pid);
1239 if (initpid <= 1 || is_shared_pidns(initpid))
1240 initpid = fc->pid;
1241
1242 cgroup = get_pid_cgroup(initpid, "memory");
1243 if (!cgroup)
1244 return read_file_fuse("/proc/meminfo", buf, size, d);
1245
1246 prune_init_slice(cgroup);
1247
b7b018d0 1248 /* memory limits */
1f5596dd
CB
1249 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1250 if (ret < 0)
b0f33646 1251 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1252
b7b018d0
CB
1253 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1254 lxcfs_error("Failed to convert memusage %s", memusage_str);
1255
acff9786 1256 if (!cgroup_parse_memory_stat(cgroup, &mstat))
b0f33646 1257 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1258
b7b018d0
CB
1259 memlimit = get_min_memlimit(cgroup, false);
1260
1f5596dd
CB
1261 /*
1262 * Following values are allowed to fail, because swapaccount might be
1263 * turned off for current kernel.
1264 */
50f7faee
WB
1265 if (wants_swap)
1266 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1f5596dd 1267
757a63e7 1268 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1f5596dd 1269 if (!f)
b0f33646 1270 return read_file_fuse("/proc/meminfo", buf, size, d);
1f5596dd 1271
b7b018d0
CB
1272 memusage /= 1024;
1273 memlimit /= 1024;
1f5596dd
CB
1274 while (getline(&line, &linelen, f) != -1) {
1275 ssize_t l;
1276 char *printme, lbuf[100];
1277
1278 memset(lbuf, 0, 100);
1279 if (startswith(line, "MemTotal:")) {
acff9786 1280 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
114eb8b8
CB
1281 if (memlimit == 0)
1282 memlimit = hosttotal;
1283
1f5596dd
CB
1284 if (hosttotal < memlimit)
1285 memlimit = hosttotal;
acff9786 1286 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1f5596dd
CB
1287 printme = lbuf;
1288 } else if (startswith(line, "MemFree:")) {
acff9786 1289 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1f5596dd
CB
1290 printme = lbuf;
1291 } else if (startswith(line, "MemAvailable:")) {
acff9786 1292 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024);
1f5596dd 1293 printme = lbuf;
07c90197 1294 } else if (startswith(line, "SwapTotal:")) {
b7b018d0
CB
1295 if (wants_swap) {
1296 uint64_t hostswtotal = 0;
1297
1298 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1299
33aa929e
SG
1300 /* The total amount of swap is always reported to be the
1301 lesser of the RAM+SWAP limit or the SWAP device size.
1302 This is because the kernel can swap as much as it
1303 wants and not only up to swtotal. */
1304
50f7faee
WB
1305 if (!liblxcfs_memory_is_cgroupv2())
1306 swtotal += memlimit;
1307
33aa929e 1308 if (hostswtotal < swtotal) {
b7b018d0 1309 swtotal = hostswtotal;
b7b018d0 1310 }
362d1193
SG
1311
1312 /* When swappiness is 0, pretend we can't swap. */
1313 if (memswpriority == 0) {
1314 swtotal = swusage;
1315 }
b7b018d0
CB
1316 }
1317
6bfe1016 1318 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1f5596dd 1319 printme = lbuf;
07c90197 1320 } else if (startswith(line, "SwapFree:")) {
b7b018d0 1321 if (wants_swap) {
33aa929e 1322 swfree = swtotal - swusage;
07c90197 1323 }
b7b018d0 1324
6bfe1016 1325 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1f5596dd
CB
1326 printme = lbuf;
1327 } else if (startswith(line, "Slab:")) {
6ddc3c00 1328 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1329 printme = lbuf;
1330 } else if (startswith(line, "Buffers:")) {
acff9786 1331 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1332 printme = lbuf;
1333 } else if (startswith(line, "Cached:")) {
acff9786
CB
1334 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1335 mstat.total_cache / 1024);
1f5596dd
CB
1336 printme = lbuf;
1337 } else if (startswith(line, "SwapCached:")) {
acff9786 1338 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1339 printme = lbuf;
1340 } else if (startswith(line, "Active:")) {
acff9786
CB
1341 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1342 (mstat.total_active_anon +
1343 mstat.total_active_file) /
1344 1024);
1f5596dd
CB
1345 printme = lbuf;
1346 } else if (startswith(line, "Inactive:")) {
acff9786
CB
1347 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1348 (mstat.total_inactive_anon +
1349 mstat.total_inactive_file) /
1350 1024);
1f5596dd 1351 printme = lbuf;
659b0278 1352 } else if (startswith(line, "Active(anon):")) {
acff9786
CB
1353 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1354 mstat.total_active_anon / 1024);
1f5596dd 1355 printme = lbuf;
659b0278 1356 } else if (startswith(line, "Inactive(anon):")) {
acff9786
CB
1357 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1358 mstat.total_inactive_anon / 1024);
1f5596dd 1359 printme = lbuf;
659b0278 1360 } else if (startswith(line, "Active(file):")) {
acff9786
CB
1361 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1362 mstat.total_active_file / 1024);
1f5596dd 1363 printme = lbuf;
659b0278 1364 } else if (startswith(line, "Inactive(file):")) {
acff9786
CB
1365 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1366 mstat.total_inactive_file / 1024);
1f5596dd 1367 printme = lbuf;
659b0278 1368 } else if (startswith(line, "Unevictable:")) {
acff9786
CB
1369 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1370 mstat.total_unevictable / 1024);
1371 printme = lbuf;
659b0278 1372 } else if (startswith(line, "Dirty:")) {
acff9786
CB
1373 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1374 mstat.total_dirty / 1024);
1375 printme = lbuf;
659b0278 1376 } else if (startswith(line, "Writeback:")) {
acff9786
CB
1377 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1378 mstat.total_writeback / 1024);
1379 printme = lbuf;
659b0278 1380 } else if (startswith(line, "AnonPages:")) {
acff9786
CB
1381 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1382 (mstat.total_active_anon +
1383 mstat.total_inactive_anon - mstat.total_shmem) /
1384 1024);
1385 printme = lbuf;
659b0278 1386 } else if (startswith(line, "Mapped:")) {
acff9786
CB
1387 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1388 mstat.total_mapped_file / 1024);
1f5596dd 1389 printme = lbuf;
659b0278 1390 } else if (startswith(line, "SReclaimable:")) {
acff9786 1391 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd 1392 printme = lbuf;
659b0278 1393 } else if (startswith(line, "SUnreclaim:")) {
acff9786 1394 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd
CB
1395 printme = lbuf;
1396 } else if (startswith(line, "Shmem:")) {
acff9786
CB
1397 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1398 mstat.total_shmem / 1024);
1f5596dd 1399 printme = lbuf;
659b0278 1400 } else if (startswith(line, "ShmemHugePages:")) {
acff9786 1401 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1f5596dd 1402 printme = lbuf;
659b0278 1403 } else if (startswith(line, "ShmemPmdMapped:")) {
acff9786
CB
1404 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1405 printme = lbuf;
659b0278 1406 } else if (startswith(line, "AnonHugePages:")) {
acff9786
CB
1407 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1408 mstat.total_rss_huge / 1024);
1f5596dd 1409 printme = lbuf;
acff9786
CB
1410 } else {
1411 printme = line;
1412 }
1f5596dd
CB
1413
1414 l = snprintf(cache, cache_size, "%s", printme);
f75d5b75
CB
1415 if (l < 0)
1416 return log_error(0, "Failed to write cache");
3cf1e562 1417 if ((size_t)l >= cache_size)
f75d5b75 1418 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
1419
1420 cache += l;
1421 cache_size -= l;
1422 total_len += l;
1423 }
1424
1425 d->cached = 1;
1426 d->size = total_len;
f75d5b75
CB
1427 if (total_len > size)
1428 total_len = size;
1f5596dd
CB
1429 memcpy(buf, d->buf, total_len);
1430
1431 return total_len;
1432}
1433
6cc153e6
FS
1434static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1435 struct fuse_file_info *fi)
1436{
1437 __do_free char *cgroup = NULL, *line = NULL;
1438 __do_free void *fopen_cache = NULL;
1439 __do_fclose FILE *f = NULL;
1440 __do_close int fd = -EBADF;
1441 struct fuse_context *fc = fuse_get_context();
1442 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1443 size_t linelen = 0, total_len = 0;
1444 char *cache = d->buf;
1445 size_t cache_size = d->buflen;
1446 pid_t initpid;
1447
1448 if (offset) {
3cf1e562 1449 size_t left;
6cc153e6
FS
1450
1451 if (offset > d->size)
1452 return -EINVAL;
1453
1454 if (!d->cached)
1455 return 0;
1456
1457 left = d->size - offset;
1458 total_len = left > size ? size : left;
1459 memcpy(buf, cache + offset, total_len);
1460
1461 return total_len;
1462 }
1463
1464 initpid = lookup_initpid_in_store(fc->pid);
1465 if (initpid <= 1 || is_shared_pidns(initpid))
1466 initpid = fc->pid;
1467
1468 cgroup = get_pid_cgroup(initpid, "memory");
1469 if (!cgroup)
1470 return read_file_fuse("/proc/slabinfo", buf, size, d);
1471
1472 prune_init_slice(cgroup);
1473
1474 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1475 if (fd < 0)
1476 return read_file_fuse("/proc/slabinfo", buf, size, d);
1477
1478 f = fdopen_cached(fd, "re", &fopen_cache);
1479 if (!f)
1480 return read_file_fuse("/proc/slabinfo", buf, size, d);
1481
1482 while (getline(&line, &linelen, f) != -1) {
1483 ssize_t l = snprintf(cache, cache_size, "%s", line);
1484 if (l < 0)
1485 return log_error(0, "Failed to write cache");
3cf1e562 1486 if ((size_t)l >= cache_size)
6cc153e6
FS
1487 return log_error(0, "Write to cache was truncated");
1488
1489 cache += l;
1490 cache_size -= l;
1491 total_len += l;
1492 }
1493
1494 d->cached = 1;
1495 d->size = total_len;
1496 if (total_len > size)
1497 total_len = size;
1498 memcpy(buf, d->buf, total_len);
1499
1500 return total_len;
1501}
1502
2d7bcab7
CB
1503__lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1504 off_t offset, struct fuse_file_info *fi)
1f5596dd 1505{
99b183fb 1506 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
1507
1508 switch (f->type) {
1509 case LXC_TYPE_PROC_MEMINFO:
cbfc55fd
CB
1510 if (liblxcfs_functional())
1511 return proc_meminfo_read(buf, size, offset, fi);
1512
1513 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1514 buf, size, offset, f);
1f5596dd 1515 case LXC_TYPE_PROC_CPUINFO:
cbfc55fd
CB
1516 if (liblxcfs_functional())
1517 return proc_cpuinfo_read(buf, size, offset, fi);
1518
1519 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1520 buf, size, offset, f);
1f5596dd 1521 case LXC_TYPE_PROC_UPTIME:
cbfc55fd
CB
1522 if (liblxcfs_functional())
1523 return proc_uptime_read(buf, size, offset, fi);
1524
1525 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1526 buf, size, offset, f);
1f5596dd 1527 case LXC_TYPE_PROC_STAT:
cbfc55fd
CB
1528 if (liblxcfs_functional())
1529 return proc_stat_read(buf, size, offset, fi);
1530
1531 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1532 size, offset, f);
1f5596dd 1533 case LXC_TYPE_PROC_DISKSTATS:
cbfc55fd
CB
1534 if (liblxcfs_functional())
1535 return proc_diskstats_read(buf, size, offset, fi);
1536
1537 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1538 buf, size, offset, f);
1f5596dd 1539 case LXC_TYPE_PROC_SWAPS:
cbfc55fd
CB
1540 if (liblxcfs_functional())
1541 return proc_swaps_read(buf, size, offset, fi);
1542
1543 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1544 size, offset, f);
1f5596dd 1545 case LXC_TYPE_PROC_LOADAVG:
cbfc55fd
CB
1546 if (liblxcfs_functional())
1547 return proc_loadavg_read(buf, size, offset, fi);
1548
1549 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1550 buf, size, offset, f);
6cc153e6
FS
1551 case LXC_TYPE_PROC_SLABINFO:
1552 if (liblxcfs_functional())
1553 return proc_slabinfo_read(buf, size, offset, fi);
1554
1555 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1556 buf, size, offset, f);
1f5596dd 1557 }
99b183fb
CB
1558
1559 return -EINVAL;
1f5596dd 1560}