]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_fuse.c
Release LXCFS 6.0.0
[mirror_lxcfs.git] / src / proc_fuse.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <pthread.h>
11 #include <sched.h>
12 #include <stdarg.h>
13 #include <stdbool.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <time.h>
19 #include <unistd.h>
20 #include <wait.h>
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
24 #include <sys/mman.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/personality.h>
28 #include <sys/socket.h>
29 #include <sys/syscall.h>
30 #include <sys/sysinfo.h>
31 #include <sys/vfs.h>
32
33 #include "proc_fuse.h"
34
35 #include "bindings.h"
36 #include "cgroup_fuse.h"
37 #include "cgroups/cgroup.h"
38 #include "cgroups/cgroup_utils.h"
39 #include "cpuset_parse.h"
40 #include "lxcfs_fuse_compat.h"
41 #include "memory_utils.h"
42 #include "proc_loadavg.h"
43 #include "proc_cpuview.h"
44 #include "utils.h"
45
/*
 * Collection of 64-bit memory counters.
 *
 * NOTE(review): the field names look like keys of a cgroup "memory.stat"
 * file; the code that fills this structure is not part of this section, so
 * confirm against the parser before relying on that.
 */
struct memory_stat {
	/* limits */
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;

	/* usage broken down by kind */
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;

	/* paging event counters */
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;

	/* LRU list sizes */
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
67
68 static off_t get_procfile_size(const char *path)
69 {
70 __do_fclose FILE *f = NULL;
71 __do_free char *line = NULL;
72 size_t len = 0;
73 ssize_t sz, answer = 0;
74
75 f = fopen(path, "re");
76 if (!f)
77 return 0;
78
79 while ((sz = getline(&line, &len, f)) != -1)
80 answer += sz;
81
82 return answer;
83 }
84
85 static off_t get_procfile_size_with_personality(const char *path)
86 {
87 struct fuse_context *fc = fuse_get_context();
88 __u32 host_personality = liblxcfs_personality(), caller_personality;
89 bool change_personality;
90 int ret;
91 off_t procfile_size_ret;
92
93 if (get_task_personality(fc->pid, &caller_personality) < 0)
94 return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid);
95
96 /* do we need to change thread personality? */
97 change_personality = host_personality != caller_personality;
98
99 if (change_personality) {
100 ret = personality(caller_personality);
101 if (ret == -1)
102 return log_error(0, "Call to personality(%d) failed: %s\n",
103 caller_personality, strerror(errno));
104
105 lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n",
106 (int)syscall(SYS_gettid), ret, caller_personality);
107 }
108
109 procfile_size_ret = get_procfile_size(path);
110
111 if (change_personality) {
112 ret = personality(host_personality);
113 if (ret == -1)
114 return log_error(0, "Call to personality(%d) failed: %s\n",
115 host_personality, strerror(errno));
116
117 lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n",
118 (int)syscall(SYS_gettid), ret, host_personality);
119 }
120
121 return procfile_size_ret;
122 }
123
124 __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
125 {
126 struct timespec now;
127
128 memset(sb, 0, sizeof(struct stat));
129 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
130 return -EINVAL;
131
132 sb->st_uid = sb->st_gid = 0;
133 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
134 if (strcmp(path, "/proc") == 0) {
135 sb->st_mode = S_IFDIR | 00555;
136 sb->st_nlink = 2;
137 return 0;
138 }
139
140 if (strcmp(path, "/proc/meminfo") == 0 ||
141 strcmp(path, "/proc/cpuinfo") == 0 ||
142 strcmp(path, "/proc/uptime") == 0 ||
143 strcmp(path, "/proc/stat") == 0 ||
144 strcmp(path, "/proc/diskstats") == 0 ||
145 strcmp(path, "/proc/swaps") == 0 ||
146 strcmp(path, "/proc/loadavg") == 0 ||
147 strcmp(path, "/proc/slabinfo") == 0) {
148 if (liblxcfs_functional())
149 sb->st_size = get_procfile_size_with_personality(path);
150 else
151 sb->st_size = get_procfile_size(path);
152 sb->st_mode = S_IFREG | 00444;
153 sb->st_nlink = 1;
154 return 0;
155 }
156
157 return -ENOENT;
158 }
159
160 __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
161 fuse_fill_dir_t filler, off_t offset,
162 struct fuse_file_info *fi)
163 {
164 if (dir_filler(filler, buf, ".", 0) != 0 ||
165 dir_filler(filler, buf, "..", 0) != 0 ||
166 dir_filler(filler, buf, "cpuinfo", 0) != 0 ||
167 dir_filler(filler, buf, "meminfo", 0) != 0 ||
168 dir_filler(filler, buf, "stat", 0) != 0 ||
169 dir_filler(filler, buf, "uptime", 0) != 0 ||
170 dir_filler(filler, buf, "diskstats", 0) != 0 ||
171 dir_filler(filler, buf, "swaps", 0) != 0 ||
172 dir_filler(filler, buf, "loadavg", 0) != 0 ||
173 dir_filler(filler, buf, "slabinfo", 0) != 0)
174 return -EINVAL;
175
176 return 0;
177 }
178
179 __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
180 {
181 __do_free struct file_info *info = NULL;
182 int type = -1;
183
184 if (strcmp(path, "/proc/meminfo") == 0)
185 type = LXC_TYPE_PROC_MEMINFO;
186 else if (strcmp(path, "/proc/cpuinfo") == 0)
187 type = LXC_TYPE_PROC_CPUINFO;
188 else if (strcmp(path, "/proc/uptime") == 0)
189 type = LXC_TYPE_PROC_UPTIME;
190 else if (strcmp(path, "/proc/stat") == 0)
191 type = LXC_TYPE_PROC_STAT;
192 else if (strcmp(path, "/proc/diskstats") == 0)
193 type = LXC_TYPE_PROC_DISKSTATS;
194 else if (strcmp(path, "/proc/swaps") == 0)
195 type = LXC_TYPE_PROC_SWAPS;
196 else if (strcmp(path, "/proc/loadavg") == 0)
197 type = LXC_TYPE_PROC_LOADAVG;
198 else if (strcmp(path, "/proc/slabinfo") == 0)
199 type = LXC_TYPE_PROC_SLABINFO;
200 if (type == -1)
201 return -ENOENT;
202
203 info = zalloc(sizeof(*info));
204 if (!info)
205 return -ENOMEM;
206
207 info->type = type;
208
209 if (liblxcfs_functional())
210 info->buflen = get_procfile_size_with_personality(path) + BUF_RESERVE_SIZE;
211 else
212 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
213
214 info->buf = zalloc(info->buflen);
215 if (!info->buf)
216 return -ENOMEM;
217 /* set actual size to buffer size */
218 info->size = info->buflen;
219
220 fi->fh = PTR_TO_UINT64(move_ptr(info));
221 return 0;
222 }
223
224 __lxcfs_fuse_ops int proc_access(const char *path, int mask)
225 {
226 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
227 return 0;
228
229 /* these are all read-only */
230 if ((mask & ~R_OK) != 0)
231 return -EACCES;
232
233 return 0;
234 }
235
236 __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
237 {
238 do_release_file_info(fi);
239 return 0;
240 }
241
242 /**
243 * Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is
244 * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
245 * reads the memory (RAM) limits.
246 *
247 * @returns 0 on success (and sets `*limit`), < 0 on error
248 */
249 static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit)
250 {
251 __do_free char *memlimit_str = NULL;
252 uint64_t memlimit = UINT64_MAX;
253 int ret;
254
255 if (swap)
256 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
257 else
258 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
259
260 if (ret < 0)
261 return ret;
262
263 if (memlimit_str[0]) {
264 ret = safe_uint64(memlimit_str, &memlimit, 10);
265 if (ret < 0) {
266 lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
267 swap ? ".swap" : "", memlimit_str, cgroup);
268 return ret;
269 }
270 }
271 *limit = memlimit;
272 return 0;
273 }
274
/*
 * dirname() with the semantics of the glibc 2.32 implementation this helper
 * was originally copied from; `get_min_memlimit()` relies on "/some-dir"
 * being reduced to "/".
 *
 * The directory part is produced by truncating @path in place. When @path is
 * NULL or contains no directory part, a pointer to a static "." is returned
 * and @path is left unmodified.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *slash, *cut;

	if (!path)
		return (char *)dot;

	slash = strrchr(path, '/');

	if (slash && slash != path && slash[1] == '\0') {
		/*
		 * The path ends in one or more slashes: step back over them
		 * and look for the slash that precedes the last component.
		 */
		cut = slash;
		while (cut != path && cut[-1] == '/')
			cut--;

		if (cut != path) {
			char *p;

			slash = NULL;
			for (p = cut; p != path; p--) {
				if (p[-1] == '/') {
					slash = p - 1;
					break;
				}
			}
		}
	}

	/* No directory part at all. */
	if (!slash)
		return (char *)dot;

	/* Collapse the run of slashes that separates directory and name. */
	cut = slash;
	while (cut != path && cut[-1] == '/')
		cut--;

	if (cut == path) {
		/*
		 * Only slashes lead up to the last component, so the result
		 * is "/". Exactly two leading slashes are kept as "//"
		 * (see XBD 4.10 Path Name Resolution).
		 */
		cut = (slash == path + 1) ? slash + 1 : path + 1;
	}

	*cut = '\0';
	return path;
}
336
337 /**
338 * Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is
339 * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
340 * reads the memory (RAM) limits.
341 *
342 * @returns 0 on success (and sets `*limit`), < 0 on error
343 */
344 static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit)
345 {
346 __do_free char *copy = NULL;
347 uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX;
348 int ret;
349
350 copy = strdup(cgroup);
351 if (!copy)
352 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
353
354 ret = get_memlimit(copy, swap, &retlimit);
355 if (ret < 0)
356 return ret;
357
358 /*
359 * If the cgroup doesn't start with / (probably won't happen), dirname()
360 * will terminate with "" instead of "/"
361 */
362 while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) {
363 char *it = copy;
364
365 it = gnu_dirname(it);
366 ret = get_memlimit(it, swap, &memlimit);
367 if (ret < 0)
368 return ret;
369 if (memlimit < retlimit)
370 retlimit = memlimit;
371 }
372
373 *limit = retlimit;
374 return 0;
375 }
376
/* Returns true if @line begins with @pref (an empty @pref always matches). */
static inline bool startswith(const char *line, const char *pref)
{
	while (*pref != '\0') {
		if (*line++ != *pref++)
			return false;
	}

	return true;
}
381
382 static void get_swap_info(const char *cgroup, uint64_t memlimit,
383 uint64_t memusage, uint64_t *swtotal,
384 uint64_t *swusage, uint64_t *memswpriority)
385 {
386 __do_free char *memswusage_str = NULL, *memswpriority_str = NULL;
387 uint64_t memswlimit = 0, memswusage = 0;
388 int ret;
389
390 *swtotal = *swusage = 0;
391 *memswpriority = 1;
392
393 ret = get_min_memlimit(cgroup, true, &memswlimit);
394 if (ret < 0)
395 return;
396 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
397 if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) < 0)
398 return;
399
400 if (liblxcfs_memory_is_cgroupv2()) {
401 *swtotal = memswlimit / 1024;
402 *swusage = memswusage / 1024;
403 } else {
404 if (memlimit > memswlimit)
405 *swtotal = 0;
406 else
407 *swtotal = (memswlimit - memlimit) / 1024;
408 if (memusage > memswusage || *swtotal == 0)
409 *swusage = 0;
410 else
411 *swusage = (memswusage - memusage) / 1024;
412 }
413
414 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
415 if (ret >= 0)
416 safe_uint64(memswpriority_str, memswpriority, 10);
417 }
418
419 static int proc_swaps_read(char *buf, size_t size, off_t offset,
420 struct fuse_file_info *fi)
421 {
422 __do_free char *cgroup = NULL, *memusage_str = NULL,
423 *memswusage_str = NULL, *memswpriority_str = NULL;
424 struct fuse_context *fc = fuse_get_context();
425 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
426 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
427 uint64_t memlimit = 0, memusage = 0,
428 swtotal = 0, swusage = 0, memswpriority = 1,
429 hostswtotal = 0, hostswfree = 0;
430 ssize_t total_len = 0;
431 ssize_t l = 0;
432 char *cache = d->buf;
433 int ret;
434 __do_free char *line = NULL;
435 __do_free void *fopen_cache = NULL;
436 __do_fclose FILE *f = NULL;
437 size_t linelen = 0;
438
439 if (offset) {
440 size_t left;
441
442 if (offset > d->size)
443 return -EINVAL;
444
445 if (!d->cached)
446 return 0;
447
448 left = d->size - offset;
449 total_len = left > size ? size: left;
450 memcpy(buf, cache + offset, total_len);
451
452 return total_len;
453 }
454
455 pid_t initpid = lookup_initpid_in_store(fc->pid);
456 if (initpid <= 1 || is_shared_pidns(initpid))
457 initpid = fc->pid;
458
459 cgroup = get_pid_cgroup(initpid, "memory");
460 if (!cgroup)
461 return read_file_fuse("/proc/swaps", buf, size, d);
462 prune_init_slice(cgroup);
463
464 ret = get_min_memlimit(cgroup, false, &memlimit);
465 if (ret < 0)
466 return 0;
467 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
468 if (ret < 0)
469 return 0;
470 if (safe_uint64(memusage_str, &memusage, 10) < 0)
471 lxcfs_error("Failed to convert memusage %s", memusage_str);
472
473 if (wants_swap)
474 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
475
476 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
477
478 /* Read host total and free values */
479 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
480 if (!f)
481 return 0;
482
483 while (getline(&line, &linelen, f) != -1) {
484 if (startswith(line, "SwapTotal:"))
485 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
486 else if (startswith(line, "SwapFree:"))
487 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
488 }
489
490 if (wants_swap) {
491 /* For cgroups v1, the total amount of swap is always reported to be the
492 lesser of the RAM+SWAP limit or the SWAP device size.
493 This is because the kernel can swap as much as it
494 wants and not only up to swtotal. */
495 if (!liblxcfs_memory_is_cgroupv2())
496 swtotal = memlimit / 1024 + swtotal;
497
498 if (hostswtotal < swtotal) {
499 swtotal = hostswtotal;
500 }
501
502 /* When swappiness is 0, pretend we can't swap. */
503 if (memswpriority == 0) {
504 swtotal = swusage;
505 }
506 }
507
508 if (swtotal > 0) {
509 l = snprintf(d->buf + total_len, d->size - total_len,
510 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
511 36, " ", swtotal, swusage);
512 total_len += l;
513 }
514
515 if (total_len < 0 || l < 0)
516 return log_error(0, "Failed writing to cache");
517
518 d->cached = 1;
519 d->size = (int)total_len;
520
521 if ((size_t)total_len > size)
522 total_len = size;
523 memcpy(buf, d->buf, total_len);
524
525 return total_len;
526 }
527
/*
 * Look up one counter in a multi-line blkio statistics string.
 *
 * @str is scanned line by line for a line starting with
 * "<major>:<minor> <iotype>"; the unsigned number following that prefix is
 * stored in @v.
 *
 * @v is set to 0 when @str is NULL, when no line matches, when the number
 * cannot be parsed, or when the lookup key does not fit the internal buffer.
 * Accepting NULL matters because callers pass the string on unchecked after
 * a failed cgroup read.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32];
	const char *cur;
	size_t len;
	int ret;

	*v = 0;

	if (!str)
		return;

	ret = snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	/* A truncated key could match the wrong line, so give up instead. */
	if (ret < 0 || (size_t)ret >= sizeof(key))
		return;
	len = (size_t)ret;

	for (cur = str; *cur != '\0';) {
		const char *eol;

		if (strncmp(cur, key, len) == 0) {
			if (sscanf(cur + len, "%" PRIu64, v) != 1)
				*v = 0;
			return;
		}

		eol = strchr(cur, '\n');
		if (!eol)
			return;

		cur = eol + 1;
	}
}
551
/*
 * One /proc/diskstats line. The number in each comment is the position of
 * the column within that line.
 */
struct lxcfs_diskstats {
	/* device identification */
	unsigned int major;       /*  1 - major number */
	unsigned int minor;       /*  2 - minor number */
	char dev_name[72];        /*  3 - device name */

	/* reads */
	uint64_t read;            /*  4 - reads completed successfully */
	uint64_t read_merged;     /*  5 - reads merged */
	uint64_t read_sectors;    /*  6 - sectors read */
	uint64_t read_ticks;      /*  7 - time spent reading (ms) */

	/* writes */
	uint64_t write;           /*  8 - writes completed */
	uint64_t write_merged;    /*  9 - writes merged */
	uint64_t write_sectors;   /* 10 - sectors written */
	uint64_t write_ticks;     /* 11 - time spent writing (ms) */

	/* queue */
	uint64_t ios_pgr;         /* 12 - I/Os currently in progress */
	uint64_t total_ticks;     /* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;        /* 14 - weighted time spent doing I/Os (ms) */

	/* discards (4.18+) */
	uint64_t discard;         /* 15 - discards completed successfully */
	uint64_t discard_merged;  /* 16 - discards merged */
	uint64_t discard_sectors; /* 17 - sectors discarded */
	uint64_t discard_ticks;   /* 18 - time spent discarding */
};
572
573 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
574 struct fuse_file_info *fi)
575 {
576 __do_free char *cg = NULL, *io_serviced_str = NULL,
577 *io_merged_str = NULL, *io_service_bytes_str = NULL,
578 *io_wait_time_str = NULL, *io_service_time_str = NULL,
579 *line = NULL;
580 __do_free void *fopen_cache = NULL;
581 __do_fclose FILE *f = NULL;
582 struct fuse_context *fc = fuse_get_context();
583 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
584 struct lxcfs_diskstats stats = {};
585 /* helper fields */
586 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
587 write_wait_time, discard_wait_time;
588 char *cache = d->buf;
589 size_t cache_size = d->buflen;
590 size_t linelen = 0, total_len = 0;
591 int i = 0;
592 int ret;
593
594 if (offset) {
595 size_t left;
596
597 if (offset > d->size)
598 return -EINVAL;
599
600 if (!d->cached)
601 return 0;
602
603 left = d->size - offset;
604 total_len = left > size ? size: left;
605 memcpy(buf, cache + offset, total_len);
606
607 return total_len;
608 }
609
610 pid_t initpid = lookup_initpid_in_store(fc->pid);
611 if (initpid <= 1 || is_shared_pidns(initpid))
612 initpid = fc->pid;
613
614 cg = get_pid_cgroup(initpid, "blkio");
615 if (!cg)
616 return read_file_fuse("/proc/diskstats", buf, size, d);
617 prune_init_slice(cg);
618
619 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
620 if (ret < 0) {
621 if (ret == -EOPNOTSUPP)
622 return read_file_fuse("/proc/diskstats", buf, size, d);
623 }
624
625 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
626 if (ret < 0) {
627 if (ret == -EOPNOTSUPP)
628 return read_file_fuse("/proc/diskstats", buf, size, d);
629 }
630
631 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
632 if (ret < 0) {
633 if (ret == -EOPNOTSUPP)
634 return read_file_fuse("/proc/diskstats", buf, size, d);
635 }
636
637 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
638 if (ret < 0) {
639 if (ret == -EOPNOTSUPP)
640 return read_file_fuse("/proc/diskstats", buf, size, d);
641 }
642
643 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
644 if (ret < 0) {
645 if (ret == -EOPNOTSUPP)
646 return read_file_fuse("/proc/diskstats", buf, size, d);
647 }
648
649 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
650 if (!f)
651 return 0;
652
653 while (getline(&line, &linelen, f) != -1) {
654 ssize_t l;
655 char lbuf[256];
656
657 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
658 if (i != 3)
659 continue;
660
661 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
662 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
663 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
664
665 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
666 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
667 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
668
669 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
670 stats.read_sectors = stats.read_sectors / 512;
671 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
672 stats.write_sectors = stats.write_sectors / 512;
673 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
674 stats.discard_sectors = stats.discard_sectors / 512;
675
676 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
677 read_service_time = read_service_time / 1000000;
678 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
679 read_wait_time = read_wait_time / 1000000;
680 stats.read_ticks = read_service_time + read_wait_time;
681
682 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
683 write_service_time = write_service_time / 1000000;
684 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
685 write_wait_time = write_wait_time / 1000000;
686 stats.write_ticks = write_service_time + write_wait_time;
687
688 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
689 discard_service_time = discard_service_time / 1000000;
690 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
691 discard_wait_time = discard_wait_time / 1000000;
692 stats.discard_ticks = discard_service_time + discard_wait_time;
693
694 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
695 stats.total_ticks = stats.total_ticks / 1000000;
696
697 memset(lbuf, 0, sizeof(lbuf));
698 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
699 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
700 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks || stats.discard ||
701 stats.discard_merged || stats.discard_sectors || stats.discard_ticks) {
702 ret = strnprintf(
703 lbuf,
704 sizeof(lbuf),
705 "%u %u" /* major, minor */
706 " %s" /* dev_name */
707 " %" PRIu64 /* read */
708 " %" PRIu64 /* read_merged */
709 " %" PRIu64 /* read_sectors */
710 " %" PRIu64 /* read_ticks */
711 " %" PRIu64 /* write */
712 " %" PRIu64 /* write_merged */
713 " %" PRIu64 /* write_sectors */
714 " %" PRIu64 /* write_ticks */
715 " %" PRIu64 /* ios_pgr */
716 " %" PRIu64 /* total_ticks */
717 " %" PRIu64 /* rq_ticks */
718 " %" PRIu64 /* discard */
719 " %" PRIu64 /* discard_merged */
720 " %" PRIu64 /* discard_sectors */
721 " %" PRIu64 /* discard_ticks */
722 "\n",
723 stats.major,
724 stats.minor,
725 stats.dev_name,
726 stats.read,
727 stats.read_merged,
728 stats.read_sectors,
729 stats.read_ticks,
730 stats.write,
731 stats.write_merged,
732 stats.write_sectors,
733 stats.write_ticks,
734 stats.ios_pgr,
735 stats.total_ticks,
736 stats.rq_ticks,
737 stats.discard,
738 stats.discard_merged,
739 stats.discard_sectors,
740 stats.discard_ticks);
741 if (ret < 0) {
742 lxcfs_error("Insufficient buffer for %u:%u %s diskstats",
743 stats.major, stats.minor, stats.dev_name);
744 continue;
745 }
746 } else {
747 continue;
748 }
749
750 l = snprintf(cache, cache_size, "%s", lbuf);
751 if (l < 0)
752 return log_error(0, "Failed to write cache");
753 if ((size_t)l >= cache_size)
754 return log_error(0, "Write to cache was truncated");
755
756 cache += l;
757 cache_size -= l;
758 total_len += l;
759 }
760
761 d->cached = 1;
762 d->size = total_len;
763 if (total_len > size)
764 total_len = size;
765 memcpy(buf, d->buf, total_len);
766
767 return total_len;
768 }
769
#ifdef RELOADTEST
/*
 * Drop a marker file so a test built with RELOADTEST can tell that this code
 * ran. Best effort: the result of mknod() is deliberately ignored (the file
 * may already exist).
 *
 * mknod() takes the file type OR-ed with the permission bits as its second
 * argument; the third argument is a device number and is ignored for regular
 * files.
 */
static inline void iwashere(void)
{
	mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
776
777 /*
778 * This function retrieves the busy time of a group of tasks by looking at
779 * cpuacct.usage. Unfortunately, this only makes sense when the container has
780 * been given it's own cpuacct cgroup. If not, this function will take the busy
781 * time of all other taks that do not actually belong to the container into
782 * account as well. If someone has a clever solution for this please send a
783 * patch!
784 */
785 static double get_reaper_busy(pid_t task)
786 {
787 __do_free char *cgroup = NULL, *usage_str = NULL;
788 uint64_t usage = 0;
789 pid_t initpid;
790
791 initpid = lookup_initpid_in_store(task);
792 if (initpid <= 0)
793 return 0;
794
795 cgroup = get_pid_cgroup(initpid, "cpuacct");
796 if (!cgroup)
797 return 0;
798 prune_init_slice(cgroup);
799
800 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
801 return 0;
802
803 if (safe_uint64(usage_str, &usage, 10) < 0)
804 lxcfs_error("Failed to convert usage %s", usage_str);
805
806 return ((double)usage / 1000000000);
807 }
808
809 static uint64_t get_reaper_start_time(pid_t pid)
810 {
811 __do_free void *fopen_cache = NULL;
812 __do_fclose FILE *f = NULL;
813 int ret;
814 uint64_t starttime;
815 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
816 STRLITERALLEN("/stat") + 1];
817 pid_t qpid;
818
819 qpid = lookup_initpid_in_store(pid);
820 if (qpid <= 0)
821 return ret_errno(EINVAL);
822
823 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
824 if (ret < 0 || (size_t)ret >= sizeof(path))
825 return ret_errno(EINVAL);
826
827 f = fopen_cached(path, "re", &fopen_cache);
828 if (!f)
829 return ret_errno(EINVAL);
830
831 /* Note that the *scanf() argument supression requires that length
832 * modifiers such as "l" are omitted. Otherwise some compilers will yell
833 * at us. It's like telling someone you're not married and then asking
834 * if you can bring your wife to the party.
835 */
836 ret = fscanf(f, "%*d " /* (1) pid %d */
837 "%*s " /* (2) comm %s */
838 "%*c " /* (3) state %c */
839 "%*d " /* (4) ppid %d */
840 "%*d " /* (5) pgrp %d */
841 "%*d " /* (6) session %d */
842 "%*d " /* (7) tty_nr %d */
843 "%*d " /* (8) tpgid %d */
844 "%*u " /* (9) flags %u */
845 "%*u " /* (10) minflt %lu */
846 "%*u " /* (11) cminflt %lu */
847 "%*u " /* (12) majflt %lu */
848 "%*u " /* (13) cmajflt %lu */
849 "%*u " /* (14) utime %lu */
850 "%*u " /* (15) stime %lu */
851 "%*d " /* (16) cutime %ld */
852 "%*d " /* (17) cstime %ld */
853 "%*d " /* (18) priority %ld */
854 "%*d " /* (19) nice %ld */
855 "%*d " /* (20) num_threads %ld */
856 "%*d " /* (21) itrealvalue %ld */
857 "%" PRIu64, /* (22) starttime %llu */
858 &starttime);
859 if (ret != 1)
860 return ret_errno(EINVAL);
861
862 return ret_set_errno(starttime, 0);
863 }
864
/*
 * Start time of the init process responsible for @pid, in seconds.
 *
 * @returns the start time, or 0 if it cannot be determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	long ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	/*
	 * The value is unsigned, so "<= 0" would only ever catch 0. Also
	 * reject anything above INT64_MAX: that is what a negative errno
	 * value looks like after conversion to uint64_t, and no real tick
	 * count gets that large.
	 */
	if (clockticks == 0 || clockticks > INT64_MAX)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* 0 is rejected as well, it would be used as a divisor below. */
	if (ticks_per_sec <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (double)ticks_per_sec;
}
883
884 static double get_reaper_age(pid_t pid)
885 {
886 uint64_t uptime_ms;
887 double procstart, procage;
888
889 /*
890 * We need to substract the time the process has started since system
891 * boot minus the time when the system has started to get the actual
892 * reaper age.
893 */
894 procstart = get_reaper_start_time_in_sec(pid);
895 procage = procstart;
896 if (procstart > 0) {
897 int ret;
898 struct timespec spec;
899
900 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
901 if (ret < 0)
902 return 0;
903
904 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
905 procage = (uptime_ms - (procstart * 1000)) / 1000;
906 }
907
908 return procage;
909 }
910
911 /*
912 * We read /proc/uptime and reuse its second field.
913 * For the first field, we use the mtime for the reaper for
914 * the calling pid as returned by getreaperage
915 */
916 static int proc_uptime_read(char *buf, size_t size, off_t offset,
917 struct fuse_file_info *fi)
918 {
919 struct fuse_context *fc = fuse_get_context();
920 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
921 char *cache = d->buf;
922 ssize_t total_len = 0, ret = 0;
923 double busytime, idletime, reaperage;
924
925 #ifdef RELOADTEST
926 iwashere();
927 #endif
928
929 if (offset) {
930 size_t left;
931
932 if (offset > d->size)
933 return -EINVAL;
934
935 if (!d->cached)
936 return 0;
937
938 left = d->size - offset;
939 total_len = left > size ? size : left;
940 memcpy(buf, cache + offset, total_len);
941
942 return total_len;
943 }
944
945 reaperage = get_reaper_age(fc->pid);
946 /*
947 * To understand why this is done, please read the comment to the
948 * get_reaper_busy() function.
949 */
950 idletime = reaperage;
951 busytime = get_reaper_busy(fc->pid);
952 if (reaperage >= busytime)
953 idletime = reaperage - busytime;
954
955 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
956 if (ret < 0 || ret >= d->buflen)
957 return read_file_fuse("/proc/uptime", buf, size, d);
958 total_len = ret;
959
960 d->cached = 1;
961 d->size = total_len;
962 if ((size_t)total_len > size)
963 total_len = size;
964 memcpy(buf, d->buf, total_len);
965
966 return total_len;
967 }
968
969 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
970 static int proc_stat_read(char *buf, size_t size, off_t offset,
971 struct fuse_file_info *fi)
972 {
973 __do_free char *cg = NULL, *cpu_cg = NULL, *cpuset = NULL, *line = NULL;
974 __do_free void *fopen_cache = NULL;
975 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
976 __do_fclose FILE *f = NULL;
977 struct fuse_context *fc = fuse_get_context();
978 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
979 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
980 size_t linelen = 0, total_len = 0;
981 int curcpu = -1; /* cpu numbering starts at 0 */
982 int physcpu = 0;
983 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
984 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
985 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
986 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
987 guest_sum = 0, guest_nice_sum = 0;
988 char cpuall[CPUALL_MAX_SIZE];
989 /* reserve for cpu all */
990 char *cache = d->buf + CPUALL_MAX_SIZE;
991 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
992 int cg_cpu_usage_size = 0;
993
994 if (offset) {
995 size_t left;
996
997 if (offset > d->size)
998 return -EINVAL;
999
1000 if (!d->cached)
1001 return 0;
1002
1003 left = d->size - offset;
1004 total_len = left > size ? size : left;
1005 memcpy(buf, d->buf + offset, total_len);
1006
1007 return total_len;
1008 }
1009
1010 pid_t initpid = lookup_initpid_in_store(fc->pid);
1011 if (initpid <= 1 || is_shared_pidns(initpid))
1012 initpid = fc->pid;
1013
1014 /*
1015 * when container run with host pid namespace initpid == 1, cgroup will "/"
1016 * we should return host os's /proc contents.
1017 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
1018 */
1019 if (initpid == 1)
1020 return read_file_fuse("/proc/stat", buf, size, d);
1021
1022 cg = get_pid_cgroup(initpid, "cpuset");
1023 if (!cg)
1024 return read_file_fuse("/proc/stat", buf, size, d);
1025 prune_init_slice(cg);
1026 cpu_cg = get_pid_cgroup(initpid, "cpu");
1027 if (!cpu_cg)
1028 return read_file_fuse("/proc/stat", buf, size, d);
1029 prune_init_slice(cpu_cg);
1030 cpuset = get_cpuset(cg);
1031 if (!cpuset)
1032 return 0;
1033
1034 f = fopen_cached("/proc/stat", "re", &fopen_cache);
1035 if (!f)
1036 return 0;
1037
1038 /* Skip first system cpu line. */
1039 if (getline(&line, &linelen, f) < 0)
1040 return log_error(0, "proc_stat_read read first line failed");
1041
1042 /*
1043 * Read cpuacct.usage_all for all CPUs.
1044 * If the cpuacct cgroup is present, it is used to calculate the container's
1045 * CPU usage. If not, values from the host's /proc/stat are used.
1046 */
1047 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
1048 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
1049 total_len = cpuview_proc_stat(cg, cpu_cg, cpuset, cg_cpu_usage,
1050 cg_cpu_usage_size, f,
1051 d->buf, d->buflen);
1052 goto out;
1053 }
1054 } else {
1055 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
1056 }
1057
1058 while (getline(&line, &linelen, f) != -1) {
1059 ssize_t l;
1060 char cpu_char[10]; /* That's a lot of cores */
1061 char *c;
1062 uint64_t all_used, cg_used, new_idle;
1063 int ret, cpu_to_render;
1064
1065 if (strlen(line) == 0)
1066 continue;
1067 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1068 /* not a ^cpuN line containing a number N, just print it */
1069 l = snprintf(cache, cache_size, "%s", line);
1070 if (l < 0)
1071 return log_error(0, "Failed to write cache");
1072 if ((size_t)l >= cache_size)
1073 return log_error(0, "Write to cache was truncated");
1074
1075 cache += l;
1076 cache_size -= l;
1077 total_len += l;
1078
1079 continue;
1080 }
1081
1082 if (sscanf(cpu_char, "%d", &physcpu) != 1)
1083 continue;
1084
1085 if (!cpu_in_cpuset(physcpu, cpuset))
1086 continue;
1087
1088 curcpu++;
1089
1090 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
1091 cpu_to_render = curcpu;
1092 else
1093 cpu_to_render = physcpu;
1094
1095 ret = sscanf(
1096 line,
1097 "%*s" /* <skip> */
1098 " %" PRIu64 /* user */
1099 " %" PRIu64 /* nice */
1100 " %" PRIu64 /* system */
1101 " %" PRIu64 /* idle */
1102 " %" PRIu64 /* iowait */
1103 " %" PRIu64 /* irq */
1104 " %" PRIu64 /* softirq */
1105 " %" PRIu64 /* steal */
1106 " %" PRIu64 /* guest */
1107 " %" PRIu64, /* guest_nice */
1108 &user,
1109 &nice,
1110 &system,
1111 &idle,
1112 &iowait,
1113 &irq,
1114 &softirq,
1115 &steal,
1116 &guest,
1117 &guest_nice);
1118 if (ret != 10 || !cg_cpu_usage) {
1119 c = strchr(line, ' ');
1120 if (!c)
1121 continue;
1122
1123 l = snprintf(cache, cache_size, "cpu%d%s", cpu_to_render, c);
1124 if (l < 0)
1125 return log_error(0, "Failed to write cache");
1126 if ((size_t)l >= cache_size)
1127 return log_error(0, "Write to cache was truncated");
1128
1129 cache += l;
1130 cache_size -= l;
1131 total_len += l;
1132
1133 if (ret != 10)
1134 continue;
1135 }
1136
1137 if (cg_cpu_usage) {
1138 if (physcpu >= cg_cpu_usage_size)
1139 break;
1140
1141 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1142 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1143
1144 if (all_used >= cg_used) {
1145 new_idle = idle + (all_used - cg_used);
1146 } else {
1147 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
1148 cpu_to_render, cg, all_used, cg_used);
1149 new_idle = idle;
1150 }
1151
1152 l = snprintf(cache, cache_size,
1153 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1154 cpu_to_render, cg_cpu_usage[physcpu].user,
1155 cg_cpu_usage[physcpu].system, new_idle);
1156 if (l < 0)
1157 return log_error(0, "Failed to write cache");
1158 if ((size_t)l >= cache_size)
1159 return log_error(0, "Write to cache was truncated");
1160
1161 cache += l;
1162 cache_size -= l;
1163 total_len += l;
1164
1165 user_sum += cg_cpu_usage[physcpu].user;
1166 system_sum += cg_cpu_usage[physcpu].system;
1167 idle_sum += new_idle;
1168 } else {
1169 user_sum += user;
1170 nice_sum += nice;
1171 system_sum += system;
1172 idle_sum += idle;
1173 iowait_sum += iowait;
1174 irq_sum += irq;
1175 softirq_sum += softirq;
1176 steal_sum += steal;
1177 guest_sum += guest;
1178 guest_nice_sum += guest_nice;
1179 }
1180 }
1181
1182 cache = d->buf;
1183
1184 int cpuall_len = snprintf(
1185 cpuall,
1186 CPUALL_MAX_SIZE,
1187 "cpu "
1188 " %" PRIu64 /* user_sum */
1189 " %" PRIu64 /* nice_sum */
1190 " %" PRIu64 /* system_sum */
1191 " %" PRIu64 /* idle_sum */
1192 " %" PRIu64 /* iowait_sum */
1193 " %" PRIu64 /* irq_sum */
1194 " %" PRIu64 /* softirq_sum */
1195 " %" PRIu64 /* steal_sum */
1196 " %" PRIu64 /* guest_sum */
1197 " %" PRIu64 /* guest_nice_sum */
1198 "\n",
1199 user_sum,
1200 nice_sum,
1201 system_sum,
1202 idle_sum,
1203 iowait_sum,
1204 irq_sum,
1205 softirq_sum,
1206 steal_sum,
1207 guest_sum,
1208 guest_nice_sum);
1209 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1210 memcpy(cache, cpuall, cpuall_len);
1211 cache += cpuall_len;
1212 } else {
1213 /* shouldn't happen */
1214 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1215 cpuall_len = 0;
1216 }
1217
1218 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1219 total_len += cpuall_len;
1220
1221 out:
1222 d->cached = 1;
1223 d->size = total_len;
1224 if (total_len > size)
1225 total_len = size;
1226
1227 memcpy(buf, d->buf, total_len);
1228 return total_len;
1229 }
1230
1231 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1232 static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1233 {
1234 __do_close int fd = -EBADF;
1235 __do_fclose FILE *f = NULL;
1236 __do_free char *line = NULL;
1237 __do_free void *fdopen_cache = NULL;
1238 bool unified;
1239 size_t len = 0;
1240 ssize_t linelen;
1241
1242 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1243 if (fd < 0)
1244 return false;
1245
1246 f = fdopen_cached(fd, "re", &fdopen_cache);
1247 if (!f)
1248 return false;
1249
1250 unified = pure_unified_layout(cgroup_ops);
1251 while ((linelen = getline(&line, &len, f)) != -1) {
1252 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1253 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1254 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1255 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
1256 } else if (startswith(line, unified ? "file" :"total_cache")) {
1257 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
1258 } else if (!unified && startswith(line, "total_rss")) {
1259 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1260 } else if (!unified && startswith(line, "total_rss_huge")) {
1261 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1262 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1263 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1264 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1265 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1266 } else if (!unified && startswith(line, "total_dirty")) {
1267 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1268 } else if (!unified && startswith(line, "total_writeback")) {
1269 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1270 } else if (!unified && startswith(line, "total_swap")) {
1271 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1272 } else if (!unified && startswith(line, "total_pgpgin")) {
1273 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1274 } else if (!unified && startswith(line, "total_pgpgout")) {
1275 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1276 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1277 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1278 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1279 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1280 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1281 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1282 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1283 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1284 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1285 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1286 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1287 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1288 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1289 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1290 }
1291 }
1292
1293 return true;
1294 }
1295
1296 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1297 struct fuse_file_info *fi)
1298 {
1299 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
1300 *memswusage_str = NULL, *memswpriority_str = NULL;
1301 __do_free void *fopen_cache = NULL;
1302 __do_fclose FILE *f = NULL;
1303 struct fuse_context *fc = fuse_get_context();
1304 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
1305 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1306 uint64_t memlimit = 0, memusage = 0,
1307 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1308 memswpriority = 1;
1309 struct memory_stat mstat = {};
1310 size_t linelen = 0, total_len = 0;
1311 char *cache = d->buf;
1312 size_t cache_size = d->buflen;
1313 int ret;
1314
1315 if (offset) {
1316 size_t left;
1317
1318 if (offset > d->size)
1319 return -EINVAL;
1320
1321 if (!d->cached)
1322 return 0;
1323
1324 left = d->size - offset;
1325 total_len = left > size ? size : left;
1326 memcpy(buf, cache + offset, total_len);
1327
1328 return total_len;
1329 }
1330
1331 pid_t initpid = lookup_initpid_in_store(fc->pid);
1332 if (initpid <= 1 || is_shared_pidns(initpid))
1333 initpid = fc->pid;
1334
1335 cgroup = get_pid_cgroup(initpid, "memory");
1336 if (!cgroup)
1337 return read_file_fuse("/proc/meminfo", buf, size, d);
1338
1339 prune_init_slice(cgroup);
1340
1341 /* memory limits */
1342 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1343 if (ret < 0)
1344 return read_file_fuse("/proc/meminfo", buf, size, d);
1345
1346 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1347 lxcfs_error("Failed to convert memusage %s", memusage_str);
1348
1349 if (!cgroup_parse_memory_stat(cgroup, &mstat))
1350 return read_file_fuse("/proc/meminfo", buf, size, d);
1351
1352 ret = get_min_memlimit(cgroup, false, &memlimit);
1353 if (ret < 0)
1354 return read_file_fuse("/proc/meminfo", buf, size, d);
1355 /*
1356 * Following values are allowed to fail, because swapaccount might be
1357 * turned off for current kernel.
1358 */
1359 if (wants_swap)
1360 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1361
1362 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1363 if (!f)
1364 return read_file_fuse("/proc/meminfo", buf, size, d);
1365
1366 memusage /= 1024;
1367 memlimit /= 1024;
1368 while (getline(&line, &linelen, f) != -1) {
1369 ssize_t l;
1370 char *printme, lbuf[100];
1371
1372 memset(lbuf, 0, 100);
1373 if (startswith(line, "MemTotal:")) {
1374 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
1375 if (memlimit == 0)
1376 memlimit = hosttotal;
1377
1378 if (hosttotal < memlimit)
1379 memlimit = hosttotal;
1380 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1381 printme = lbuf;
1382 } else if (startswith(line, "MemFree:")) {
1383 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1384 printme = lbuf;
1385 } else if (startswith(line, "MemAvailable:")) {
1386 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + (mstat.total_active_file + mstat.total_inactive_file) / 1024);
1387 printme = lbuf;
1388 } else if (startswith(line, "SwapTotal:")) {
1389 if (wants_swap) {
1390 uint64_t hostswtotal = 0;
1391
1392 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1393
1394 /* In cgroups v1, the total amount of swap is always reported to be the
1395 lesser of the RAM+SWAP limit or the SWAP device size.
1396 This is because the kernel can swap as much as it
1397 wants and not only up to swtotal. */
1398 if (!liblxcfs_memory_is_cgroupv2())
1399 swtotal += memlimit;
1400
1401 if (hostswtotal < swtotal) {
1402 swtotal = hostswtotal;
1403 }
1404
1405 /* When swappiness is 0, pretend we can't swap. */
1406 if (memswpriority == 0) {
1407 swtotal = swusage;
1408 }
1409 }
1410
1411 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1412 printme = lbuf;
1413 } else if (startswith(line, "SwapFree:")) {
1414 if (wants_swap) {
1415 swfree = swtotal - swusage;
1416 }
1417
1418 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1419 printme = lbuf;
1420 } else if (startswith(line, "Slab:")) {
1421 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1422 printme = lbuf;
1423 } else if (startswith(line, "Buffers:")) {
1424 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1425 printme = lbuf;
1426 } else if (startswith(line, "Cached:")) {
1427 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1428 mstat.total_cache / 1024);
1429 printme = lbuf;
1430 } else if (startswith(line, "SwapCached:")) {
1431 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1432 printme = lbuf;
1433 } else if (startswith(line, "Active:")) {
1434 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1435 (mstat.total_active_anon +
1436 mstat.total_active_file) /
1437 1024);
1438 printme = lbuf;
1439 } else if (startswith(line, "Inactive:")) {
1440 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1441 (mstat.total_inactive_anon +
1442 mstat.total_inactive_file) /
1443 1024);
1444 printme = lbuf;
1445 } else if (startswith(line, "Active(anon):")) {
1446 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1447 mstat.total_active_anon / 1024);
1448 printme = lbuf;
1449 } else if (startswith(line, "Inactive(anon):")) {
1450 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1451 mstat.total_inactive_anon / 1024);
1452 printme = lbuf;
1453 } else if (startswith(line, "Active(file):")) {
1454 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1455 mstat.total_active_file / 1024);
1456 printme = lbuf;
1457 } else if (startswith(line, "Inactive(file):")) {
1458 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1459 mstat.total_inactive_file / 1024);
1460 printme = lbuf;
1461 } else if (startswith(line, "Unevictable:")) {
1462 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1463 mstat.total_unevictable / 1024);
1464 printme = lbuf;
1465 } else if (startswith(line, "Dirty:")) {
1466 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1467 mstat.total_dirty / 1024);
1468 printme = lbuf;
1469 } else if (startswith(line, "Writeback:")) {
1470 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1471 mstat.total_writeback / 1024);
1472 printme = lbuf;
1473 } else if (startswith(line, "AnonPages:")) {
1474 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1475 (mstat.total_active_anon +
1476 mstat.total_inactive_anon - mstat.total_shmem) /
1477 1024);
1478 printme = lbuf;
1479 } else if (startswith(line, "Mapped:")) {
1480 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1481 mstat.total_mapped_file / 1024);
1482 printme = lbuf;
1483 } else if (startswith(line, "SReclaimable:")) {
1484 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1485 printme = lbuf;
1486 } else if (startswith(line, "SUnreclaim:")) {
1487 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1488 printme = lbuf;
1489 } else if (startswith(line, "Shmem:")) {
1490 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1491 mstat.total_shmem / 1024);
1492 printme = lbuf;
1493 } else if (startswith(line, "ShmemHugePages:")) {
1494 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1495 printme = lbuf;
1496 } else if (startswith(line, "ShmemPmdMapped:")) {
1497 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1498 printme = lbuf;
1499 } else if (startswith(line, "AnonHugePages:")) {
1500 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1501 mstat.total_rss_huge / 1024);
1502 printme = lbuf;
1503 } else {
1504 printme = line;
1505 }
1506
1507 l = snprintf(cache, cache_size, "%s", printme);
1508 if (l < 0)
1509 return log_error(0, "Failed to write cache");
1510 if ((size_t)l >= cache_size)
1511 return log_error(0, "Write to cache was truncated");
1512
1513 cache += l;
1514 cache_size -= l;
1515 total_len += l;
1516 }
1517
1518 d->cached = 1;
1519 d->size = total_len;
1520 if (total_len > size)
1521 total_len = size;
1522 memcpy(buf, d->buf, total_len);
1523
1524 return total_len;
1525 }
1526
1527 static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1528 struct fuse_file_info *fi)
1529 {
1530 __do_free char *cgroup = NULL, *line = NULL;
1531 __do_free void *fopen_cache = NULL;
1532 __do_fclose FILE *f = NULL;
1533 __do_close int fd = -EBADF;
1534 struct fuse_context *fc = fuse_get_context();
1535 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1536 size_t linelen = 0, total_len = 0;
1537 char *cache = d->buf;
1538 size_t cache_size = d->buflen;
1539 pid_t initpid;
1540
1541 if (offset) {
1542 size_t left;
1543
1544 if (offset > d->size)
1545 return -EINVAL;
1546
1547 if (!d->cached)
1548 return 0;
1549
1550 left = d->size - offset;
1551 total_len = left > size ? size : left;
1552 memcpy(buf, cache + offset, total_len);
1553
1554 return total_len;
1555 }
1556
1557 initpid = lookup_initpid_in_store(fc->pid);
1558 if (initpid <= 1 || is_shared_pidns(initpid))
1559 initpid = fc->pid;
1560
1561 cgroup = get_pid_cgroup(initpid, "memory");
1562 if (!cgroup)
1563 return read_file_fuse("/proc/slabinfo", buf, size, d);
1564
1565 prune_init_slice(cgroup);
1566
1567 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1568 if (fd < 0)
1569 return read_file_fuse("/proc/slabinfo", buf, size, d);
1570
1571 f = fdopen_cached(fd, "re", &fopen_cache);
1572 if (!f)
1573 return read_file_fuse("/proc/slabinfo", buf, size, d);
1574
1575 while (getline(&line, &linelen, f) != -1) {
1576 ssize_t l = snprintf(cache, cache_size, "%s", line);
1577 if (l < 0)
1578 return log_error(0, "Failed to write cache");
1579 if ((size_t)l >= cache_size)
1580 return log_error(0, "Write to cache was truncated");
1581
1582 cache += l;
1583 cache_size -= l;
1584 total_len += l;
1585 }
1586
1587 d->cached = 1;
1588 d->size = total_len;
1589 if (total_len > size)
1590 total_len = size;
1591 memcpy(buf, d->buf, total_len);
1592
1593 return total_len;
1594 }
1595
1596 static int proc_read_with_personality(int (*do_proc_read)(char *, size_t, off_t,
1597 struct fuse_file_info *), char *buf, size_t size, off_t offset,
1598 struct fuse_file_info *fi)
1599 {
1600 struct fuse_context *fc = fuse_get_context();
1601 __u32 host_personality = liblxcfs_personality(), caller_personality;
1602 bool change_personality;
1603 int ret, read_ret;
1604
1605 if (get_task_personality(fc->pid, &caller_personality) < 0)
1606 return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid);
1607
1608 /* do we need to change thread personality? */
1609 change_personality = host_personality != caller_personality;
1610
1611 if (change_personality) {
1612 ret = personality(caller_personality);
1613 if (ret == -1)
1614 return log_error(0, "Call to personality(%d) failed: %s\n",
1615 caller_personality, strerror(errno));
1616
1617 lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n",
1618 (int)syscall(SYS_gettid), ret, caller_personality);
1619 }
1620
1621 read_ret = do_proc_read(buf, size, offset, fi);
1622
1623 if (change_personality) {
1624 ret = personality(host_personality);
1625 if (ret == -1)
1626 return log_error(0, "Call to personality(%d) failed: %s\n",
1627 host_personality, strerror(errno));
1628
1629 lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n",
1630 (int)syscall(SYS_gettid), ret, host_personality);
1631 }
1632
1633 return read_ret;
1634 }
1635
1636 __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1637 off_t offset, struct fuse_file_info *fi)
1638 {
1639 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1640
1641 switch (f->type) {
1642 case LXC_TYPE_PROC_MEMINFO:
1643 if (liblxcfs_functional())
1644 return proc_meminfo_read(buf, size, offset, fi);
1645
1646 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1647 buf, size, offset, f);
1648 case LXC_TYPE_PROC_CPUINFO:
1649 if (liblxcfs_functional())
1650 return proc_read_with_personality(&proc_cpuinfo_read, buf, size, offset, fi);
1651
1652 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1653 buf, size, offset, f);
1654 case LXC_TYPE_PROC_UPTIME:
1655 if (liblxcfs_functional())
1656 return proc_uptime_read(buf, size, offset, fi);
1657
1658 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1659 buf, size, offset, f);
1660 case LXC_TYPE_PROC_STAT:
1661 if (liblxcfs_functional())
1662 return proc_stat_read(buf, size, offset, fi);
1663
1664 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1665 size, offset, f);
1666 case LXC_TYPE_PROC_DISKSTATS:
1667 if (liblxcfs_functional())
1668 return proc_diskstats_read(buf, size, offset, fi);
1669
1670 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1671 buf, size, offset, f);
1672 case LXC_TYPE_PROC_SWAPS:
1673 if (liblxcfs_functional())
1674 return proc_swaps_read(buf, size, offset, fi);
1675
1676 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1677 size, offset, f);
1678 case LXC_TYPE_PROC_LOADAVG:
1679 if (liblxcfs_functional())
1680 return proc_loadavg_read(buf, size, offset, fi);
1681
1682 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1683 buf, size, offset, f);
1684 case LXC_TYPE_PROC_SLABINFO:
1685 if (liblxcfs_functional())
1686 return proc_slabinfo_read(buf, size, offset, fi);
1687
1688 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1689 buf, size, offset, f);
1690 }
1691
1692 return -EINVAL;
1693 }