]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_fuse.c
Merge pull request #624 from peppaJoeng/main
[mirror_lxcfs.git] / src / proc_fuse.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <pthread.h>
11 #include <sched.h>
12 #include <stdarg.h>
13 #include <stdbool.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <time.h>
19 #include <unistd.h>
20 #include <wait.h>
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
24 #include <sys/mman.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/personality.h>
28 #include <sys/socket.h>
29 #include <sys/syscall.h>
30 #include <sys/sysinfo.h>
31 #include <sys/vfs.h>
32
33 #include "proc_fuse.h"
34
35 #include "bindings.h"
36 #include "cgroup_fuse.h"
37 #include "cgroups/cgroup.h"
38 #include "cgroups/cgroup_utils.h"
39 #include "cpuset_parse.h"
40 #include "lxcfs_fuse_compat.h"
41 #include "memory_utils.h"
42 #include "proc_loadavg.h"
43 #include "proc_cpuview.h"
44 #include "utils.h"
45
/*
 * Container for per-cgroup memory statistics.
 *
 * NOTE(review): the field names look like the "hierarchical_*" and "total_*"
 * keys of a cgroup memory.stat file; confirm against the code that fills this
 * structure. Field order is significant for nothing but layout and is kept
 * exactly as before.
 */
struct memory_stat {
	/* limits */
	uint64_t hierarchical_memory_limit, hierarchical_memsw_limit;

	/* usage counters */
	uint64_t total_cache, total_rss, total_rss_huge, total_shmem,
		 total_mapped_file, total_dirty, total_writeback, total_swap;

	/* pg* event counters */
	uint64_t total_pgpgin, total_pgpgout, total_pgfault, total_pgmajfault;

	/* active/inactive/unevictable counters */
	uint64_t total_inactive_anon, total_active_anon, total_inactive_file,
		 total_active_file, total_unevictable;
};
67
68 static off_t get_procfile_size(const char *path)
69 {
70 __do_fclose FILE *f = NULL;
71 __do_free char *line = NULL;
72 size_t len = 0;
73 ssize_t sz, answer = 0;
74
75 f = fopen(path, "re");
76 if (!f)
77 return 0;
78
79 while ((sz = getline(&line, &len, f)) != -1)
80 answer += sz;
81
82 return answer;
83 }
84
85 static off_t get_procfile_size_with_personality(const char *path)
86 {
87 struct fuse_context *fc = fuse_get_context();
88 __u32 host_personality = liblxcfs_personality(), caller_personality;
89 bool change_personality;
90 int ret;
91 off_t procfile_size_ret;
92
93 if (get_task_personality(fc->pid, &caller_personality) < 0)
94 return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid);
95
96 /* do we need to change thread personality? */
97 change_personality = host_personality != caller_personality;
98
99 if (change_personality) {
100 ret = personality(caller_personality);
101 if (ret == -1)
102 return log_error(0, "Call to personality(%d) failed: %s\n",
103 caller_personality, strerror(errno));
104
105 lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n",
106 (int)syscall(SYS_gettid), ret, caller_personality);
107 }
108
109 procfile_size_ret = get_procfile_size(path);
110
111 if (change_personality) {
112 ret = personality(host_personality);
113 if (ret == -1)
114 return log_error(0, "Call to personality(%d) failed: %s\n",
115 host_personality, strerror(errno));
116
117 lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n",
118 (int)syscall(SYS_gettid), ret, host_personality);
119 }
120
121 return procfile_size_ret;
122 }
123
124 __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
125 {
126 struct timespec now;
127
128 memset(sb, 0, sizeof(struct stat));
129 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
130 return -EINVAL;
131
132 sb->st_uid = sb->st_gid = 0;
133 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
134 if (strcmp(path, "/proc") == 0) {
135 sb->st_mode = S_IFDIR | 00555;
136 sb->st_nlink = 2;
137 return 0;
138 }
139
140 if (strcmp(path, "/proc/meminfo") == 0 ||
141 strcmp(path, "/proc/cpuinfo") == 0 ||
142 strcmp(path, "/proc/uptime") == 0 ||
143 strcmp(path, "/proc/stat") == 0 ||
144 strcmp(path, "/proc/diskstats") == 0 ||
145 strcmp(path, "/proc/swaps") == 0 ||
146 strcmp(path, "/proc/loadavg") == 0 ||
147 strcmp(path, "/proc/slabinfo") == 0) {
148 if (liblxcfs_functional())
149 sb->st_size = get_procfile_size_with_personality(path);
150 else
151 sb->st_size = get_procfile_size(path);
152 sb->st_mode = S_IFREG | 00444;
153 sb->st_nlink = 1;
154 return 0;
155 }
156
157 return -ENOENT;
158 }
159
160 __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
161 fuse_fill_dir_t filler, off_t offset,
162 struct fuse_file_info *fi)
163 {
164 if (dir_filler(filler, buf, ".", 0) != 0 ||
165 dir_filler(filler, buf, "..", 0) != 0 ||
166 dir_filler(filler, buf, "cpuinfo", 0) != 0 ||
167 dir_filler(filler, buf, "meminfo", 0) != 0 ||
168 dir_filler(filler, buf, "stat", 0) != 0 ||
169 dir_filler(filler, buf, "uptime", 0) != 0 ||
170 dir_filler(filler, buf, "diskstats", 0) != 0 ||
171 dir_filler(filler, buf, "swaps", 0) != 0 ||
172 dir_filler(filler, buf, "loadavg", 0) != 0 ||
173 dir_filler(filler, buf, "slabinfo", 0) != 0)
174 return -EINVAL;
175
176 return 0;
177 }
178
179 __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
180 {
181 __do_free struct file_info *info = NULL;
182 int type = -1;
183
184 if (strcmp(path, "/proc/meminfo") == 0)
185 type = LXC_TYPE_PROC_MEMINFO;
186 else if (strcmp(path, "/proc/cpuinfo") == 0)
187 type = LXC_TYPE_PROC_CPUINFO;
188 else if (strcmp(path, "/proc/uptime") == 0)
189 type = LXC_TYPE_PROC_UPTIME;
190 else if (strcmp(path, "/proc/stat") == 0)
191 type = LXC_TYPE_PROC_STAT;
192 else if (strcmp(path, "/proc/diskstats") == 0)
193 type = LXC_TYPE_PROC_DISKSTATS;
194 else if (strcmp(path, "/proc/swaps") == 0)
195 type = LXC_TYPE_PROC_SWAPS;
196 else if (strcmp(path, "/proc/loadavg") == 0)
197 type = LXC_TYPE_PROC_LOADAVG;
198 else if (strcmp(path, "/proc/slabinfo") == 0)
199 type = LXC_TYPE_PROC_SLABINFO;
200 if (type == -1)
201 return -ENOENT;
202
203 info = zalloc(sizeof(*info));
204 if (!info)
205 return -ENOMEM;
206
207 info->type = type;
208
209 if (liblxcfs_functional())
210 info->buflen = get_procfile_size_with_personality(path) + BUF_RESERVE_SIZE;
211 else
212 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
213
214 info->buf = zalloc(info->buflen);
215 if (!info->buf)
216 return -ENOMEM;
217 /* set actual size to buffer size */
218 info->size = info->buflen;
219
220 fi->fh = PTR_TO_UINT64(move_ptr(info));
221 return 0;
222 }
223
224 __lxcfs_fuse_ops int proc_access(const char *path, int mask)
225 {
226 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
227 return 0;
228
229 /* these are all read-only */
230 if ((mask & ~R_OK) != 0)
231 return -EACCES;
232
233 return 0;
234 }
235
236 __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
237 {
238 do_release_file_info(fi);
239 return 0;
240 }
241
242 static uint64_t get_memlimit(const char *cgroup, bool swap)
243 {
244 __do_free char *memlimit_str = NULL;
245 uint64_t memlimit = 0;
246 int ret;
247
248 if (swap)
249 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
250 else
251 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
252 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
253 lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
254 swap ? ".swap" : "", memlimit_str, cgroup);
255
256 return memlimit;
257 }
258
/*
 * GNU-flavoured dirname(), adapted from glibc-2.32. POSIX dirname("/some-dir")
 * will return "/some-dir" as opposed to "/", which breaks get_min_memlimit().
 *
 * @path is modified in place: the string is cut at the end of its directory
 * part and @path is returned. When @path is NULL or has no directory part, a
 * pointer to a static, constant "." is returned instead and @path is left
 * untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *slash, *scan;

	slash = path ? strrchr(path, '/') : NULL;

	/* A trailing '/' (not the leading one) means we have to look further. */
	if (slash && slash != path && slash[1] == '\0') {
		/* Skip the whole run of trailing slashes. */
		scan = slash;
		while (scan != path && scan[-1] == '/')
			scan--;

		if (scan != path)
			slash = memrchr(path, '/', scan - path);
	}

	/*
	 * No directory part: the XPG specs require returning "." here, hence
	 * the cast away from const on a static string.
	 */
	if (!slash)
		return (char *)dot;

	/* Collapse the run of slashes that separates directory and basename. */
	scan = slash;
	while (scan != path && scan[-1] == '/')
		scan--;

	if (scan != path) {
		slash = scan;
	} else if (slash == path + 1) {
		/*
		 * Exactly two leading slashes: "//" has to be preserved, see
		 * XBD 4.10 Path Name Resolution.
		 */
		slash++;
	} else {
		/* The directory part is the root: keep a single "/". */
		slash = path + 1;
	}

	*slash = '\0';

	return path;
}
320
321 static uint64_t get_min_memlimit(const char *cgroup, bool swap)
322 {
323 __do_free char *copy = NULL;
324 uint64_t memlimit = 0, retlimit = 0;
325
326 copy = strdup(cgroup);
327 if (!copy)
328 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
329
330 retlimit = get_memlimit(copy, swap);
331
332 /*
333 * If the cgroup doesn't start with / (probably won't happen), dirname()
334 * will terminate with "" instead of "/"
335 */
336 while (*copy && strcmp(copy, "/") != 0) {
337 char *it = copy;
338
339 it = gnu_dirname(it);
340 memlimit = get_memlimit(it, swap);
341 if (memlimit > 0 && memlimit < retlimit)
342 retlimit = memlimit;
343 };
344
345 return retlimit;
346 }
347
/* Return true when @line begins with the complete string @pref. */
static inline bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return !strncmp(line, pref, preflen);
}
352
353 static void get_swap_info(const char *cgroup, uint64_t memlimit,
354 uint64_t memusage, uint64_t *swtotal,
355 uint64_t *swusage, uint64_t *memswpriority)
356 {
357 __do_free char *memswusage_str = NULL, *memswpriority_str = NULL;
358 uint64_t memswlimit = 0, memswusage = 0;
359 int ret;
360
361 *swtotal = *swusage = 0;
362 *memswpriority = 1;
363
364 memswlimit = get_min_memlimit(cgroup, true);
365 if (memswlimit > 0) {
366 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
367 if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
368 return;
369
370 if (liblxcfs_memory_is_cgroupv2()) {
371 *swtotal = memswlimit / 1024;
372 *swusage = memswusage / 1024;
373 } else {
374 if (memlimit > memswlimit)
375 *swtotal = 0;
376 else
377 *swtotal = (memswlimit - memlimit) / 1024;
378 if (memusage > memswusage || swtotal == 0)
379 *swusage = 0;
380 else
381 *swusage = (memswusage - memusage) / 1024;
382 }
383
384 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
385 if (ret >= 0)
386 safe_uint64(memswpriority_str, memswpriority, 10);
387 }
388 }
389
390 static int proc_swaps_read(char *buf, size_t size, off_t offset,
391 struct fuse_file_info *fi)
392 {
393 __do_free char *cgroup = NULL, *memusage_str = NULL,
394 *memswusage_str = NULL, *memswpriority_str = NULL;
395 struct fuse_context *fc = fuse_get_context();
396 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
397 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
398 uint64_t memlimit = 0, memusage = 0,
399 swtotal = 0, swusage = 0, memswpriority = 1,
400 hostswtotal = 0, hostswfree = 0;
401 ssize_t total_len = 0;
402 ssize_t l = 0;
403 char *cache = d->buf;
404 int ret;
405 __do_free char *line = NULL;
406 __do_free void *fopen_cache = NULL;
407 __do_fclose FILE *f = NULL;
408 size_t linelen = 0;
409
410 if (offset) {
411 size_t left;
412
413 if (offset > d->size)
414 return -EINVAL;
415
416 if (!d->cached)
417 return 0;
418
419 left = d->size - offset;
420 total_len = left > size ? size: left;
421 memcpy(buf, cache + offset, total_len);
422
423 return total_len;
424 }
425
426 pid_t initpid = lookup_initpid_in_store(fc->pid);
427 if (initpid <= 1 || is_shared_pidns(initpid))
428 initpid = fc->pid;
429
430 cgroup = get_pid_cgroup(initpid, "memory");
431 if (!cgroup)
432 return read_file_fuse("/proc/swaps", buf, size, d);
433 prune_init_slice(cgroup);
434
435 memlimit = get_min_memlimit(cgroup, false);
436
437 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
438 if (ret < 0)
439 return 0;
440
441 if (safe_uint64(memusage_str, &memusage, 10) < 0)
442 lxcfs_error("Failed to convert memusage %s", memusage_str);
443
444 if (wants_swap)
445 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
446
447 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
448
449 /* Read host total and free values */
450 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
451 if (!f)
452 return 0;
453
454 while (getline(&line, &linelen, f) != -1) {
455 if (startswith(line, "SwapTotal:"))
456 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
457 else if (startswith(line, "SwapFree:"))
458 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
459 }
460
461 if (wants_swap) {
462 /* The total amount of swap is always reported to be the
463 lesser of the RAM+SWAP limit or the SWAP device size.
464 This is because the kernel can swap as much as it
465 wants and not only up to swtotal. */
466 swtotal = memlimit / 1024 + swtotal;
467 if (hostswtotal < swtotal) {
468 swtotal = hostswtotal;
469 }
470
471 /* When swappiness is 0, pretend we can't swap. */
472 if (memswpriority == 0) {
473 swtotal = swusage;
474 }
475 }
476
477 if (swtotal > 0) {
478 l = snprintf(d->buf + total_len, d->size - total_len,
479 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
480 36, " ", swtotal, swusage);
481 total_len += l;
482 }
483
484 if (total_len < 0 || l < 0)
485 return log_error(0, "Failed writing to cache");
486
487 d->cached = 1;
488 d->size = (int)total_len;
489
490 if ((size_t)total_len > size)
491 total_len = size;
492 memcpy(buf, d->buf, total_len);
493
494 return total_len;
495 }
496
/*
 * Look up one counter in a multi-line blkio statistics string.
 *
 * @str is scanned line by line for a line that starts with
 * "<major>:<minor> <iotype>"; the unsigned number following that prefix is
 * stored in @v. @v is set to 0 when @str is NULL, when no line matches or
 * when no number follows the prefix.
 */
static void get_blkio_io_value(const char *str, unsigned major, unsigned minor,
			       const char *iotype, uint64_t *v)
{
	char key[32] = "";
	size_t len;

	*v = 0;

	/* A failed cgroup read leaves the caller without a string. */
	if (!str)
		return;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	len = strlen(key);

	while (*str) {
		const char *eol;

		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%" SCNu64, v);
			return;
		}

		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
520
/*
 * One rendered line of the virtualized /proc/diskstats. The numbers in the
 * field comments give the position of the value within that line.
 */
struct lxcfs_diskstats {
	unsigned int major;		/* 1 - major number */
	unsigned int minor;		/* 2 - minor number */
	char dev_name[72];		/* 3 - device name */
	uint64_t read;			/* 4 - reads completed successfully */
	uint64_t read_merged;		/* 5 - reads merged */
	uint64_t read_sectors;		/* 6 - sectors read */
	uint64_t read_ticks;		/* 7 - time spent reading (ms) */
	uint64_t write;			/* 8 - writes completed */
	uint64_t write_merged;		/* 9 - writes merged */
	uint64_t write_sectors;		/* 10 - sectors written */
	uint64_t write_ticks;		/* 11 - time spent writing (ms) */
	uint64_t ios_pgr;		/* 12 - I/Os currently in progress */
	uint64_t total_ticks;		/* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;		/* 14 - weighted time spent doing I/Os (ms) */
	uint64_t discard;		/* 15 - discards completed successfully	(4.18+) */
	uint64_t discard_merged;	/* 16 - discards merged			(4.18+) */
	uint64_t discard_sectors;	/* 17 - sectors discarded		(4.18+) */
	uint64_t discard_ticks;		/* 18 - time spent discarding		(4.18+) */
};
541
542 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
543 struct fuse_file_info *fi)
544 {
545 __do_free char *cg = NULL, *io_serviced_str = NULL,
546 *io_merged_str = NULL, *io_service_bytes_str = NULL,
547 *io_wait_time_str = NULL, *io_service_time_str = NULL,
548 *line = NULL;
549 __do_free void *fopen_cache = NULL;
550 __do_fclose FILE *f = NULL;
551 struct fuse_context *fc = fuse_get_context();
552 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
553 struct lxcfs_diskstats stats = {};
554 /* helper fields */
555 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
556 write_wait_time, discard_wait_time;
557 char *cache = d->buf;
558 size_t cache_size = d->buflen;
559 size_t linelen = 0, total_len = 0;
560 int i = 0;
561 int ret;
562
563 if (offset) {
564 size_t left;
565
566 if (offset > d->size)
567 return -EINVAL;
568
569 if (!d->cached)
570 return 0;
571
572 left = d->size - offset;
573 total_len = left > size ? size: left;
574 memcpy(buf, cache + offset, total_len);
575
576 return total_len;
577 }
578
579 pid_t initpid = lookup_initpid_in_store(fc->pid);
580 if (initpid <= 1 || is_shared_pidns(initpid))
581 initpid = fc->pid;
582
583 cg = get_pid_cgroup(initpid, "blkio");
584 if (!cg)
585 return read_file_fuse("/proc/diskstats", buf, size, d);
586 prune_init_slice(cg);
587
588 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
589 if (ret < 0) {
590 if (ret == -EOPNOTSUPP)
591 return read_file_fuse("/proc/diskstats", buf, size, d);
592 }
593
594 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
595 if (ret < 0) {
596 if (ret == -EOPNOTSUPP)
597 return read_file_fuse("/proc/diskstats", buf, size, d);
598 }
599
600 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
601 if (ret < 0) {
602 if (ret == -EOPNOTSUPP)
603 return read_file_fuse("/proc/diskstats", buf, size, d);
604 }
605
606 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
607 if (ret < 0) {
608 if (ret == -EOPNOTSUPP)
609 return read_file_fuse("/proc/diskstats", buf, size, d);
610 }
611
612 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
613 if (ret < 0) {
614 if (ret == -EOPNOTSUPP)
615 return read_file_fuse("/proc/diskstats", buf, size, d);
616 }
617
618 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
619 if (!f)
620 return 0;
621
622 while (getline(&line, &linelen, f) != -1) {
623 ssize_t l;
624 char lbuf[256];
625
626 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
627 if (i != 3)
628 continue;
629
630 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
631 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
632 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
633
634 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
635 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
636 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
637
638 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
639 stats.read_sectors = stats.read_sectors / 512;
640 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
641 stats.write_sectors = stats.write_sectors / 512;
642 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
643 stats.discard_sectors = stats.discard_sectors / 512;
644
645 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
646 read_service_time = read_service_time / 1000000;
647 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
648 read_wait_time = read_wait_time / 1000000;
649 stats.read_ticks = read_service_time + read_wait_time;
650
651 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
652 write_service_time = write_service_time / 1000000;
653 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
654 write_wait_time = write_wait_time / 1000000;
655 stats.write_ticks = write_service_time + write_wait_time;
656
657 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
658 discard_service_time = discard_service_time / 1000000;
659 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
660 discard_wait_time = discard_wait_time / 1000000;
661 stats.discard_ticks = discard_service_time + discard_wait_time;
662
663 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
664 stats.total_ticks = stats.total_ticks / 1000000;
665
666 memset(lbuf, 0, sizeof(lbuf));
667 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
668 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
669 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks || stats.discard ||
670 stats.discard_merged || stats.discard_sectors || stats.discard_ticks) {
671 ret = strnprintf(
672 lbuf,
673 sizeof(lbuf),
674 "%u %u" /* major, minor */
675 " %s" /* dev_name */
676 " %" PRIu64 /* read */
677 " %" PRIu64 /* read_merged */
678 " %" PRIu64 /* read_sectors */
679 " %" PRIu64 /* read_ticks */
680 " %" PRIu64 /* write */
681 " %" PRIu64 /* write_merged */
682 " %" PRIu64 /* write_sectors */
683 " %" PRIu64 /* write_ticks */
684 " %" PRIu64 /* ios_pgr */
685 " %" PRIu64 /* total_ticks */
686 " %" PRIu64 /* rq_ticks */
687 " %" PRIu64 /* discard */
688 " %" PRIu64 /* discard_merged */
689 " %" PRIu64 /* discard_sectors */
690 " %" PRIu64 /* discard_ticks */
691 "\n",
692 stats.major,
693 stats.minor,
694 stats.dev_name,
695 stats.read,
696 stats.read_merged,
697 stats.read_sectors,
698 stats.read_ticks,
699 stats.write,
700 stats.write_merged,
701 stats.write_sectors,
702 stats.write_ticks,
703 stats.ios_pgr,
704 stats.total_ticks,
705 stats.rq_ticks,
706 stats.discard,
707 stats.discard_merged,
708 stats.discard_sectors,
709 stats.discard_ticks);
710 if (ret < 0) {
711 lxcfs_error("Insufficient buffer for %u:%u %s diskstats",
712 stats.major, stats.minor, stats.dev_name);
713 continue;
714 }
715 } else {
716 continue;
717 }
718
719 l = snprintf(cache, cache_size, "%s", lbuf);
720 if (l < 0)
721 return log_error(0, "Failed to write cache");
722 if ((size_t)l >= cache_size)
723 return log_error(0, "Write to cache was truncated");
724
725 cache += l;
726 cache_size -= l;
727 total_len += l;
728 }
729
730 d->cached = 1;
731 d->size = total_len;
732 if (total_len > size)
733 total_len = size;
734 memcpy(buf, d->buf, total_len);
735
736 return total_len;
737 }
738
#ifdef RELOADTEST
/*
 * Leave a marker file behind so a reload test can tell that this code ran.
 * Failure to create the marker (for example because it already exists) is
 * deliberately ignored.
 */
static inline void iwashere(void)
{
	/*
	 * mknod(path, mode, dev): the permission bits belong into the mode
	 * argument; dev is ignored for regular files.
	 */
	(void)mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
745
746 /*
747 * This function retrieves the busy time of a group of tasks by looking at
748 * cpuacct.usage. Unfortunately, this only makes sense when the container has
749 * been given it's own cpuacct cgroup. If not, this function will take the busy
750 * time of all other taks that do not actually belong to the container into
751 * account as well. If someone has a clever solution for this please send a
752 * patch!
753 */
754 static double get_reaper_busy(pid_t task)
755 {
756 __do_free char *cgroup = NULL, *usage_str = NULL;
757 uint64_t usage = 0;
758 pid_t initpid;
759
760 initpid = lookup_initpid_in_store(task);
761 if (initpid <= 0)
762 return 0;
763
764 cgroup = get_pid_cgroup(initpid, "cpuacct");
765 if (!cgroup)
766 return 0;
767 prune_init_slice(cgroup);
768
769 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
770 return 0;
771
772 if (safe_uint64(usage_str, &usage, 10) < 0)
773 lxcfs_error("Failed to convert usage %s", usage_str);
774
775 return ((double)usage / 1000000000);
776 }
777
778 static uint64_t get_reaper_start_time(pid_t pid)
779 {
780 __do_free void *fopen_cache = NULL;
781 __do_fclose FILE *f = NULL;
782 int ret;
783 uint64_t starttime;
784 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
785 STRLITERALLEN("/stat") + 1];
786 pid_t qpid;
787
788 qpid = lookup_initpid_in_store(pid);
789 if (qpid <= 0)
790 return ret_errno(EINVAL);
791
792 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
793 if (ret < 0 || (size_t)ret >= sizeof(path))
794 return ret_errno(EINVAL);
795
796 f = fopen_cached(path, "re", &fopen_cache);
797 if (!f)
798 return ret_errno(EINVAL);
799
800 /* Note that the *scanf() argument supression requires that length
801 * modifiers such as "l" are omitted. Otherwise some compilers will yell
802 * at us. It's like telling someone you're not married and then asking
803 * if you can bring your wife to the party.
804 */
805 ret = fscanf(f, "%*d " /* (1) pid %d */
806 "%*s " /* (2) comm %s */
807 "%*c " /* (3) state %c */
808 "%*d " /* (4) ppid %d */
809 "%*d " /* (5) pgrp %d */
810 "%*d " /* (6) session %d */
811 "%*d " /* (7) tty_nr %d */
812 "%*d " /* (8) tpgid %d */
813 "%*u " /* (9) flags %u */
814 "%*u " /* (10) minflt %lu */
815 "%*u " /* (11) cminflt %lu */
816 "%*u " /* (12) majflt %lu */
817 "%*u " /* (13) cmajflt %lu */
818 "%*u " /* (14) utime %lu */
819 "%*u " /* (15) stime %lu */
820 "%*d " /* (16) cutime %ld */
821 "%*d " /* (17) cstime %ld */
822 "%*d " /* (18) priority %ld */
823 "%*d " /* (19) nice %ld */
824 "%*d " /* (20) num_threads %ld */
825 "%*d " /* (21) itrealvalue %ld */
826 "%" PRIu64, /* (22) starttime %llu */
827 &starttime);
828 if (ret != 1)
829 return ret_errno(EINVAL);
830
831 return ret_set_errno(starttime, 0);
832 }
833
/*
 * Start time of the init process associated with @pid, converted from clock
 * ticks to seconds.
 *
 * Returns 0 when the start time cannot be retrieved or the number of clock
 * ticks per second cannot be determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	long ticks_per_sec;

	/*
	 * The value is unsigned, so compare it as a signed number: this
	 * rejects 0 as well as a negative errno value that was squeezed
	 * through the unsigned return type.
	 */
	clockticks = get_reaper_start_time(pid);
	if ((int64_t)clockticks <= 0)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	/* 0 is rejected too: it would be used as a divisor below. */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (double)ticks_per_sec;
}
852
853 static double get_reaper_age(pid_t pid)
854 {
855 uint64_t uptime_ms;
856 double procstart, procage;
857
858 /*
859 * We need to substract the time the process has started since system
860 * boot minus the time when the system has started to get the actual
861 * reaper age.
862 */
863 procstart = get_reaper_start_time_in_sec(pid);
864 procage = procstart;
865 if (procstart > 0) {
866 int ret;
867 struct timespec spec;
868
869 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
870 if (ret < 0)
871 return 0;
872
873 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
874 procage = (uptime_ms - (procstart * 1000)) / 1000;
875 }
876
877 return procage;
878 }
879
880 /*
881 * We read /proc/uptime and reuse its second field.
882 * For the first field, we use the mtime for the reaper for
883 * the calling pid as returned by getreaperage
884 */
885 static int proc_uptime_read(char *buf, size_t size, off_t offset,
886 struct fuse_file_info *fi)
887 {
888 struct fuse_context *fc = fuse_get_context();
889 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
890 char *cache = d->buf;
891 ssize_t total_len = 0, ret = 0;
892 double busytime, idletime, reaperage;
893
894 #ifdef RELOADTEST
895 iwashere();
896 #endif
897
898 if (offset) {
899 size_t left;
900
901 if (offset > d->size)
902 return -EINVAL;
903
904 if (!d->cached)
905 return 0;
906
907 left = d->size - offset;
908 total_len = left > size ? size : left;
909 memcpy(buf, cache + offset, total_len);
910
911 return total_len;
912 }
913
914 reaperage = get_reaper_age(fc->pid);
915 /*
916 * To understand why this is done, please read the comment to the
917 * get_reaper_busy() function.
918 */
919 idletime = reaperage;
920 busytime = get_reaper_busy(fc->pid);
921 if (reaperage >= busytime)
922 idletime = reaperage - busytime;
923
924 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
925 if (ret < 0 || ret >= d->buflen)
926 return read_file_fuse("/proc/uptime", buf, size, d);
927 total_len = ret;
928
929 d->cached = 1;
930 d->size = total_len;
931 if ((size_t)total_len > size)
932 total_len = size;
933 memcpy(buf, d->buf, total_len);
934
935 return total_len;
936 }
937
938 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
939 static int proc_stat_read(char *buf, size_t size, off_t offset,
940 struct fuse_file_info *fi)
941 {
942 __do_free char *cg = NULL, *cpu_cg = NULL, *cpuset = NULL, *line = NULL;
943 __do_free void *fopen_cache = NULL;
944 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
945 __do_fclose FILE *f = NULL;
946 struct fuse_context *fc = fuse_get_context();
947 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
948 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
949 size_t linelen = 0, total_len = 0;
950 int curcpu = -1; /* cpu numbering starts at 0 */
951 int physcpu = 0;
952 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
953 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
954 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
955 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
956 guest_sum = 0, guest_nice_sum = 0;
957 char cpuall[CPUALL_MAX_SIZE];
958 /* reserve for cpu all */
959 char *cache = d->buf + CPUALL_MAX_SIZE;
960 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
961 int cg_cpu_usage_size = 0;
962
963 if (offset) {
964 size_t left;
965
966 if (offset > d->size)
967 return -EINVAL;
968
969 if (!d->cached)
970 return 0;
971
972 left = d->size - offset;
973 total_len = left > size ? size : left;
974 memcpy(buf, d->buf + offset, total_len);
975
976 return total_len;
977 }
978
979 pid_t initpid = lookup_initpid_in_store(fc->pid);
980 if (initpid <= 1 || is_shared_pidns(initpid))
981 initpid = fc->pid;
982
983 /*
984 * when container run with host pid namespace initpid == 1, cgroup will "/"
985 * we should return host os's /proc contents.
986 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
987 */
988 if (initpid == 1)
989 return read_file_fuse("/proc/stat", buf, size, d);
990
991 cg = get_pid_cgroup(initpid, "cpuset");
992 if (!cg)
993 return read_file_fuse("/proc/stat", buf, size, d);
994 prune_init_slice(cg);
995 cpu_cg = get_pid_cgroup(initpid, "cpu");
996 if (!cpu_cg)
997 return read_file_fuse("/proc/stat", buf, size, d);
998 prune_init_slice(cpu_cg);
999 cpuset = get_cpuset(cg);
1000 if (!cpuset)
1001 return 0;
1002
1003 f = fopen_cached("/proc/stat", "re", &fopen_cache);
1004 if (!f)
1005 return 0;
1006
1007 /* Skip first system cpu line. */
1008 if (getline(&line, &linelen, f) < 0)
1009 return log_error(0, "proc_stat_read read first line failed");
1010
1011 /*
1012 * Read cpuacct.usage_all for all CPUs.
1013 * If the cpuacct cgroup is present, it is used to calculate the container's
1014 * CPU usage. If not, values from the host's /proc/stat are used.
1015 */
1016 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
1017 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
1018 total_len = cpuview_proc_stat(cg, cpu_cg, cpuset, cg_cpu_usage,
1019 cg_cpu_usage_size, f,
1020 d->buf, d->buflen);
1021 goto out;
1022 }
1023 } else {
1024 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
1025 }
1026
1027 while (getline(&line, &linelen, f) != -1) {
1028 ssize_t l;
1029 char cpu_char[10]; /* That's a lot of cores */
1030 char *c;
1031 uint64_t all_used, cg_used, new_idle;
1032 int ret, cpu_to_render;
1033
1034 if (strlen(line) == 0)
1035 continue;
1036 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1037 /* not a ^cpuN line containing a number N, just print it */
1038 l = snprintf(cache, cache_size, "%s", line);
1039 if (l < 0)
1040 return log_error(0, "Failed to write cache");
1041 if ((size_t)l >= cache_size)
1042 return log_error(0, "Write to cache was truncated");
1043
1044 cache += l;
1045 cache_size -= l;
1046 total_len += l;
1047
1048 continue;
1049 }
1050
1051 if (sscanf(cpu_char, "%d", &physcpu) != 1)
1052 continue;
1053
1054 if (!cpu_in_cpuset(physcpu, cpuset))
1055 continue;
1056
1057 curcpu++;
1058
1059 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
1060 cpu_to_render = curcpu;
1061 else
1062 cpu_to_render = physcpu;
1063
1064 ret = sscanf(
1065 line,
1066 "%*s" /* <skip> */
1067 " %" PRIu64 /* user */
1068 " %" PRIu64 /* nice */
1069 " %" PRIu64 /* system */
1070 " %" PRIu64 /* idle */
1071 " %" PRIu64 /* iowait */
1072 " %" PRIu64 /* irq */
1073 " %" PRIu64 /* softirq */
1074 " %" PRIu64 /* steal */
1075 " %" PRIu64 /* guest */
1076 " %" PRIu64, /* guest_nice */
1077 &user,
1078 &nice,
1079 &system,
1080 &idle,
1081 &iowait,
1082 &irq,
1083 &softirq,
1084 &steal,
1085 &guest,
1086 &guest_nice);
1087 if (ret != 10 || !cg_cpu_usage) {
1088 c = strchr(line, ' ');
1089 if (!c)
1090 continue;
1091
1092 l = snprintf(cache, cache_size, "cpu%d%s", cpu_to_render, c);
1093 if (l < 0)
1094 return log_error(0, "Failed to write cache");
1095 if ((size_t)l >= cache_size)
1096 return log_error(0, "Write to cache was truncated");
1097
1098 cache += l;
1099 cache_size -= l;
1100 total_len += l;
1101
1102 if (ret != 10)
1103 continue;
1104 }
1105
1106 if (cg_cpu_usage) {
1107 if (physcpu >= cg_cpu_usage_size)
1108 break;
1109
1110 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1111 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1112
1113 if (all_used >= cg_used) {
1114 new_idle = idle + (all_used - cg_used);
1115 } else {
1116 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
1117 cpu_to_render, cg, all_used, cg_used);
1118 new_idle = idle;
1119 }
1120
1121 l = snprintf(cache, cache_size,
1122 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1123 cpu_to_render, cg_cpu_usage[physcpu].user,
1124 cg_cpu_usage[physcpu].system, new_idle);
1125 if (l < 0)
1126 return log_error(0, "Failed to write cache");
1127 if ((size_t)l >= cache_size)
1128 return log_error(0, "Write to cache was truncated");
1129
1130 cache += l;
1131 cache_size -= l;
1132 total_len += l;
1133
1134 user_sum += cg_cpu_usage[physcpu].user;
1135 system_sum += cg_cpu_usage[physcpu].system;
1136 idle_sum += new_idle;
1137 } else {
1138 user_sum += user;
1139 nice_sum += nice;
1140 system_sum += system;
1141 idle_sum += idle;
1142 iowait_sum += iowait;
1143 irq_sum += irq;
1144 softirq_sum += softirq;
1145 steal_sum += steal;
1146 guest_sum += guest;
1147 guest_nice_sum += guest_nice;
1148 }
1149 }
1150
1151 cache = d->buf;
1152
1153 int cpuall_len = snprintf(
1154 cpuall,
1155 CPUALL_MAX_SIZE,
1156 "cpu "
1157 " %" PRIu64 /* user_sum */
1158 " %" PRIu64 /* nice_sum */
1159 " %" PRIu64 /* system_sum */
1160 " %" PRIu64 /* idle_sum */
1161 " %" PRIu64 /* iowait_sum */
1162 " %" PRIu64 /* irq_sum */
1163 " %" PRIu64 /* softirq_sum */
1164 " %" PRIu64 /* steal_sum */
1165 " %" PRIu64 /* guest_sum */
1166 " %" PRIu64 /* guest_nice_sum */
1167 "\n",
1168 user_sum,
1169 nice_sum,
1170 system_sum,
1171 idle_sum,
1172 iowait_sum,
1173 irq_sum,
1174 softirq_sum,
1175 steal_sum,
1176 guest_sum,
1177 guest_nice_sum);
1178 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1179 memcpy(cache, cpuall, cpuall_len);
1180 cache += cpuall_len;
1181 } else {
1182 /* shouldn't happen */
1183 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1184 cpuall_len = 0;
1185 }
1186
1187 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1188 total_len += cpuall_len;
1189
1190 out:
1191 d->cached = 1;
1192 d->size = total_len;
1193 if (total_len > size)
1194 total_len = size;
1195
1196 memcpy(buf, d->buf, total_len);
1197 return total_len;
1198 }
1199
1200 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1201 static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1202 {
1203 __do_close int fd = -EBADF;
1204 __do_fclose FILE *f = NULL;
1205 __do_free char *line = NULL;
1206 __do_free void *fdopen_cache = NULL;
1207 bool unified;
1208 size_t len = 0;
1209 ssize_t linelen;
1210
1211 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1212 if (fd < 0)
1213 return false;
1214
1215 f = fdopen_cached(fd, "re", &fdopen_cache);
1216 if (!f)
1217 return false;
1218
1219 unified = pure_unified_layout(cgroup_ops);
1220 while ((linelen = getline(&line, &len, f)) != -1) {
1221 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1222 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1223 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1224 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
1225 } else if (startswith(line, unified ? "file" :"total_cache")) {
1226 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
1227 } else if (!unified && startswith(line, "total_rss")) {
1228 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1229 } else if (!unified && startswith(line, "total_rss_huge")) {
1230 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1231 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1232 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1233 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1234 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1235 } else if (!unified && startswith(line, "total_dirty")) {
1236 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1237 } else if (!unified && startswith(line, "total_writeback")) {
1238 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1239 } else if (!unified && startswith(line, "total_swap")) {
1240 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1241 } else if (!unified && startswith(line, "total_pgpgin")) {
1242 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1243 } else if (!unified && startswith(line, "total_pgpgout")) {
1244 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1245 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1246 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1247 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1248 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1249 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1250 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1251 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1252 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1253 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1254 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1255 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1256 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1257 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1258 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1259 }
1260 }
1261
1262 return true;
1263 }
1264
1265 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1266 struct fuse_file_info *fi)
1267 {
1268 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
1269 *memswusage_str = NULL, *memswpriority_str = NULL;
1270 __do_free void *fopen_cache = NULL;
1271 __do_fclose FILE *f = NULL;
1272 struct fuse_context *fc = fuse_get_context();
1273 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
1274 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1275 uint64_t memlimit = 0, memusage = 0,
1276 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1277 memswpriority = 1;
1278 struct memory_stat mstat = {};
1279 size_t linelen = 0, total_len = 0;
1280 char *cache = d->buf;
1281 size_t cache_size = d->buflen;
1282 int ret;
1283
1284 if (offset) {
1285 size_t left;
1286
1287 if (offset > d->size)
1288 return -EINVAL;
1289
1290 if (!d->cached)
1291 return 0;
1292
1293 left = d->size - offset;
1294 total_len = left > size ? size : left;
1295 memcpy(buf, cache + offset, total_len);
1296
1297 return total_len;
1298 }
1299
1300 pid_t initpid = lookup_initpid_in_store(fc->pid);
1301 if (initpid <= 1 || is_shared_pidns(initpid))
1302 initpid = fc->pid;
1303
1304 cgroup = get_pid_cgroup(initpid, "memory");
1305 if (!cgroup)
1306 return read_file_fuse("/proc/meminfo", buf, size, d);
1307
1308 prune_init_slice(cgroup);
1309
1310 /* memory limits */
1311 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1312 if (ret < 0)
1313 return read_file_fuse("/proc/meminfo", buf, size, d);
1314
1315 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1316 lxcfs_error("Failed to convert memusage %s", memusage_str);
1317
1318 if (!cgroup_parse_memory_stat(cgroup, &mstat))
1319 return read_file_fuse("/proc/meminfo", buf, size, d);
1320
1321 memlimit = get_min_memlimit(cgroup, false);
1322
1323 /*
1324 * Following values are allowed to fail, because swapaccount might be
1325 * turned off for current kernel.
1326 */
1327 if (wants_swap)
1328 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1329
1330 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1331 if (!f)
1332 return read_file_fuse("/proc/meminfo", buf, size, d);
1333
1334 memusage /= 1024;
1335 memlimit /= 1024;
1336 while (getline(&line, &linelen, f) != -1) {
1337 ssize_t l;
1338 char *printme, lbuf[100];
1339
1340 memset(lbuf, 0, 100);
1341 if (startswith(line, "MemTotal:")) {
1342 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
1343 if (memlimit == 0)
1344 memlimit = hosttotal;
1345
1346 if (hosttotal < memlimit)
1347 memlimit = hosttotal;
1348 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1349 printme = lbuf;
1350 } else if (startswith(line, "MemFree:")) {
1351 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1352 printme = lbuf;
1353 } else if (startswith(line, "MemAvailable:")) {
1354 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + (mstat.total_active_file + mstat.total_inactive_file) / 1024);
1355 printme = lbuf;
1356 } else if (startswith(line, "SwapTotal:")) {
1357 if (wants_swap) {
1358 uint64_t hostswtotal = 0;
1359
1360 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1361
1362 /* The total amount of swap is always reported to be the
1363 lesser of the RAM+SWAP limit or the SWAP device size.
1364 This is because the kernel can swap as much as it
1365 wants and not only up to swtotal. */
1366
1367 if (!liblxcfs_memory_is_cgroupv2())
1368 swtotal += memlimit;
1369
1370 if (hostswtotal < swtotal) {
1371 swtotal = hostswtotal;
1372 }
1373
1374 /* When swappiness is 0, pretend we can't swap. */
1375 if (memswpriority == 0) {
1376 swtotal = swusage;
1377 }
1378 }
1379
1380 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1381 printme = lbuf;
1382 } else if (startswith(line, "SwapFree:")) {
1383 if (wants_swap) {
1384 swfree = swtotal - swusage;
1385 }
1386
1387 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1388 printme = lbuf;
1389 } else if (startswith(line, "Slab:")) {
1390 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1391 printme = lbuf;
1392 } else if (startswith(line, "Buffers:")) {
1393 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1394 printme = lbuf;
1395 } else if (startswith(line, "Cached:")) {
1396 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1397 mstat.total_cache / 1024);
1398 printme = lbuf;
1399 } else if (startswith(line, "SwapCached:")) {
1400 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1401 printme = lbuf;
1402 } else if (startswith(line, "Active:")) {
1403 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1404 (mstat.total_active_anon +
1405 mstat.total_active_file) /
1406 1024);
1407 printme = lbuf;
1408 } else if (startswith(line, "Inactive:")) {
1409 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1410 (mstat.total_inactive_anon +
1411 mstat.total_inactive_file) /
1412 1024);
1413 printme = lbuf;
1414 } else if (startswith(line, "Active(anon):")) {
1415 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1416 mstat.total_active_anon / 1024);
1417 printme = lbuf;
1418 } else if (startswith(line, "Inactive(anon):")) {
1419 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1420 mstat.total_inactive_anon / 1024);
1421 printme = lbuf;
1422 } else if (startswith(line, "Active(file):")) {
1423 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1424 mstat.total_active_file / 1024);
1425 printme = lbuf;
1426 } else if (startswith(line, "Inactive(file):")) {
1427 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1428 mstat.total_inactive_file / 1024);
1429 printme = lbuf;
1430 } else if (startswith(line, "Unevictable:")) {
1431 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1432 mstat.total_unevictable / 1024);
1433 printme = lbuf;
1434 } else if (startswith(line, "Dirty:")) {
1435 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1436 mstat.total_dirty / 1024);
1437 printme = lbuf;
1438 } else if (startswith(line, "Writeback:")) {
1439 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1440 mstat.total_writeback / 1024);
1441 printme = lbuf;
1442 } else if (startswith(line, "AnonPages:")) {
1443 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1444 (mstat.total_active_anon +
1445 mstat.total_inactive_anon - mstat.total_shmem) /
1446 1024);
1447 printme = lbuf;
1448 } else if (startswith(line, "Mapped:")) {
1449 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1450 mstat.total_mapped_file / 1024);
1451 printme = lbuf;
1452 } else if (startswith(line, "SReclaimable:")) {
1453 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1454 printme = lbuf;
1455 } else if (startswith(line, "SUnreclaim:")) {
1456 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1457 printme = lbuf;
1458 } else if (startswith(line, "Shmem:")) {
1459 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1460 mstat.total_shmem / 1024);
1461 printme = lbuf;
1462 } else if (startswith(line, "ShmemHugePages:")) {
1463 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1464 printme = lbuf;
1465 } else if (startswith(line, "ShmemPmdMapped:")) {
1466 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1467 printme = lbuf;
1468 } else if (startswith(line, "AnonHugePages:")) {
1469 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1470 mstat.total_rss_huge / 1024);
1471 printme = lbuf;
1472 } else {
1473 printme = line;
1474 }
1475
1476 l = snprintf(cache, cache_size, "%s", printme);
1477 if (l < 0)
1478 return log_error(0, "Failed to write cache");
1479 if ((size_t)l >= cache_size)
1480 return log_error(0, "Write to cache was truncated");
1481
1482 cache += l;
1483 cache_size -= l;
1484 total_len += l;
1485 }
1486
1487 d->cached = 1;
1488 d->size = total_len;
1489 if (total_len > size)
1490 total_len = size;
1491 memcpy(buf, d->buf, total_len);
1492
1493 return total_len;
1494 }
1495
1496 static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1497 struct fuse_file_info *fi)
1498 {
1499 __do_free char *cgroup = NULL, *line = NULL;
1500 __do_free void *fopen_cache = NULL;
1501 __do_fclose FILE *f = NULL;
1502 __do_close int fd = -EBADF;
1503 struct fuse_context *fc = fuse_get_context();
1504 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1505 size_t linelen = 0, total_len = 0;
1506 char *cache = d->buf;
1507 size_t cache_size = d->buflen;
1508 pid_t initpid;
1509
1510 if (offset) {
1511 size_t left;
1512
1513 if (offset > d->size)
1514 return -EINVAL;
1515
1516 if (!d->cached)
1517 return 0;
1518
1519 left = d->size - offset;
1520 total_len = left > size ? size : left;
1521 memcpy(buf, cache + offset, total_len);
1522
1523 return total_len;
1524 }
1525
1526 initpid = lookup_initpid_in_store(fc->pid);
1527 if (initpid <= 1 || is_shared_pidns(initpid))
1528 initpid = fc->pid;
1529
1530 cgroup = get_pid_cgroup(initpid, "memory");
1531 if (!cgroup)
1532 return read_file_fuse("/proc/slabinfo", buf, size, d);
1533
1534 prune_init_slice(cgroup);
1535
1536 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1537 if (fd < 0)
1538 return read_file_fuse("/proc/slabinfo", buf, size, d);
1539
1540 f = fdopen_cached(fd, "re", &fopen_cache);
1541 if (!f)
1542 return read_file_fuse("/proc/slabinfo", buf, size, d);
1543
1544 while (getline(&line, &linelen, f) != -1) {
1545 ssize_t l = snprintf(cache, cache_size, "%s", line);
1546 if (l < 0)
1547 return log_error(0, "Failed to write cache");
1548 if ((size_t)l >= cache_size)
1549 return log_error(0, "Write to cache was truncated");
1550
1551 cache += l;
1552 cache_size -= l;
1553 total_len += l;
1554 }
1555
1556 d->cached = 1;
1557 d->size = total_len;
1558 if (total_len > size)
1559 total_len = size;
1560 memcpy(buf, d->buf, total_len);
1561
1562 return total_len;
1563 }
1564
1565 static int proc_read_with_personality(int (*do_proc_read)(char *, size_t, off_t,
1566 struct fuse_file_info *), char *buf, size_t size, off_t offset,
1567 struct fuse_file_info *fi)
1568 {
1569 struct fuse_context *fc = fuse_get_context();
1570 __u32 host_personality = liblxcfs_personality(), caller_personality;
1571 bool change_personality;
1572 int ret, read_ret;
1573
1574 if (get_task_personality(fc->pid, &caller_personality) < 0)
1575 return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid);
1576
1577 /* do we need to change thread personality? */
1578 change_personality = host_personality != caller_personality;
1579
1580 if (change_personality) {
1581 ret = personality(caller_personality);
1582 if (ret == -1)
1583 return log_error(0, "Call to personality(%d) failed: %s\n",
1584 caller_personality, strerror(errno));
1585
1586 lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n",
1587 (int)syscall(SYS_gettid), ret, caller_personality);
1588 }
1589
1590 read_ret = do_proc_read(buf, size, offset, fi);
1591
1592 if (change_personality) {
1593 ret = personality(host_personality);
1594 if (ret == -1)
1595 return log_error(0, "Call to personality(%d) failed: %s\n",
1596 host_personality, strerror(errno));
1597
1598 lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n",
1599 (int)syscall(SYS_gettid), ret, host_personality);
1600 }
1601
1602 return read_ret;
1603 }
1604
1605 __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1606 off_t offset, struct fuse_file_info *fi)
1607 {
1608 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1609
1610 switch (f->type) {
1611 case LXC_TYPE_PROC_MEMINFO:
1612 if (liblxcfs_functional())
1613 return proc_meminfo_read(buf, size, offset, fi);
1614
1615 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1616 buf, size, offset, f);
1617 case LXC_TYPE_PROC_CPUINFO:
1618 if (liblxcfs_functional())
1619 return proc_read_with_personality(&proc_cpuinfo_read, buf, size, offset, fi);
1620
1621 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1622 buf, size, offset, f);
1623 case LXC_TYPE_PROC_UPTIME:
1624 if (liblxcfs_functional())
1625 return proc_uptime_read(buf, size, offset, fi);
1626
1627 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1628 buf, size, offset, f);
1629 case LXC_TYPE_PROC_STAT:
1630 if (liblxcfs_functional())
1631 return proc_stat_read(buf, size, offset, fi);
1632
1633 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1634 size, offset, f);
1635 case LXC_TYPE_PROC_DISKSTATS:
1636 if (liblxcfs_functional())
1637 return proc_diskstats_read(buf, size, offset, fi);
1638
1639 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1640 buf, size, offset, f);
1641 case LXC_TYPE_PROC_SWAPS:
1642 if (liblxcfs_functional())
1643 return proc_swaps_read(buf, size, offset, fi);
1644
1645 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1646 size, offset, f);
1647 case LXC_TYPE_PROC_LOADAVG:
1648 if (liblxcfs_functional())
1649 return proc_loadavg_read(buf, size, offset, fi);
1650
1651 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1652 buf, size, offset, f);
1653 case LXC_TYPE_PROC_SLABINFO:
1654 if (liblxcfs_functional())
1655 return proc_slabinfo_read(buf, size, offset, fi);
1656
1657 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1658 buf, size, offset, f);
1659 }
1660
1661 return -EINVAL;
1662 }