]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_fuse.c
25af10a1b4550b39b1d81f88574a6945e3d5262b
[mirror_lxcfs.git] / src / proc_fuse.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <pthread.h>
11 #include <sched.h>
12 #include <stdarg.h>
13 #include <stdbool.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <time.h>
19 #include <unistd.h>
20 #include <wait.h>
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
24 #include <sys/mman.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/personality.h>
28 #include <sys/socket.h>
29 #include <sys/syscall.h>
30 #include <sys/sysinfo.h>
31 #include <sys/vfs.h>
32
33 #include "proc_fuse.h"
34
35 #include "bindings.h"
36 #include "cgroup_fuse.h"
37 #include "cgroups/cgroup.h"
38 #include "cgroups/cgroup_utils.h"
39 #include "cpuset_parse.h"
40 #include "lxcfs_fuse_compat.h"
41 #include "memory_utils.h"
42 #include "proc_loadavg.h"
43 #include "proc_cpuview.h"
44 #include "utils.h"
45
/*
 * Collection of memory accounting values for a cgroup, each stored as an
 * unsigned 64-bit integer.
 *
 * NOTE(review): the field names look like they mirror the keys of the
 * cgroup v1 memory.stat file (the "hierarchical_*" limits followed by the
 * "total_*" counters) - confirm against the code that fills this structure,
 * which is not part of this section of the file.
 */
struct memory_stat {
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
67
68 static off_t get_procfile_size(const char *path)
69 {
70 __do_fclose FILE *f = NULL;
71 __do_free char *line = NULL;
72 size_t len = 0;
73 ssize_t sz, answer = 0;
74
75 f = fopen(path, "re");
76 if (!f)
77 return 0;
78
79 while ((sz = getline(&line, &len, f)) != -1)
80 answer += sz;
81
82 return answer;
83 }
84
85 __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
86 {
87 struct timespec now;
88
89 memset(sb, 0, sizeof(struct stat));
90 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
91 return -EINVAL;
92
93 sb->st_uid = sb->st_gid = 0;
94 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
95 if (strcmp(path, "/proc") == 0) {
96 sb->st_mode = S_IFDIR | 00555;
97 sb->st_nlink = 2;
98 return 0;
99 }
100
101 if (strcmp(path, "/proc/meminfo") == 0 ||
102 strcmp(path, "/proc/cpuinfo") == 0 ||
103 strcmp(path, "/proc/uptime") == 0 ||
104 strcmp(path, "/proc/stat") == 0 ||
105 strcmp(path, "/proc/diskstats") == 0 ||
106 strcmp(path, "/proc/swaps") == 0 ||
107 strcmp(path, "/proc/loadavg") == 0 ||
108 strcmp(path, "/proc/slabinfo") == 0) {
109 sb->st_size = get_procfile_size(path);
110 sb->st_mode = S_IFREG | 00444;
111 sb->st_nlink = 1;
112 return 0;
113 }
114
115 return -ENOENT;
116 }
117
118 __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
119 fuse_fill_dir_t filler, off_t offset,
120 struct fuse_file_info *fi)
121 {
122 if (dir_filler(filler, buf, ".", 0) != 0 ||
123 dir_filler(filler, buf, "..", 0) != 0 ||
124 dir_filler(filler, buf, "cpuinfo", 0) != 0 ||
125 dir_filler(filler, buf, "meminfo", 0) != 0 ||
126 dir_filler(filler, buf, "stat", 0) != 0 ||
127 dir_filler(filler, buf, "uptime", 0) != 0 ||
128 dir_filler(filler, buf, "diskstats", 0) != 0 ||
129 dir_filler(filler, buf, "swaps", 0) != 0 ||
130 dir_filler(filler, buf, "loadavg", 0) != 0 ||
131 dir_filler(filler, buf, "slabinfo", 0) != 0)
132 return -EINVAL;
133
134 return 0;
135 }
136
137 __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
138 {
139 __do_free struct file_info *info = NULL;
140 int type = -1;
141
142 if (strcmp(path, "/proc/meminfo") == 0)
143 type = LXC_TYPE_PROC_MEMINFO;
144 else if (strcmp(path, "/proc/cpuinfo") == 0)
145 type = LXC_TYPE_PROC_CPUINFO;
146 else if (strcmp(path, "/proc/uptime") == 0)
147 type = LXC_TYPE_PROC_UPTIME;
148 else if (strcmp(path, "/proc/stat") == 0)
149 type = LXC_TYPE_PROC_STAT;
150 else if (strcmp(path, "/proc/diskstats") == 0)
151 type = LXC_TYPE_PROC_DISKSTATS;
152 else if (strcmp(path, "/proc/swaps") == 0)
153 type = LXC_TYPE_PROC_SWAPS;
154 else if (strcmp(path, "/proc/loadavg") == 0)
155 type = LXC_TYPE_PROC_LOADAVG;
156 else if (strcmp(path, "/proc/slabinfo") == 0)
157 type = LXC_TYPE_PROC_SLABINFO;
158 if (type == -1)
159 return -ENOENT;
160
161 info = zalloc(sizeof(*info));
162 if (!info)
163 return -ENOMEM;
164
165 info->type = type;
166
167 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
168
169 info->buf = zalloc(info->buflen);
170 if (!info->buf)
171 return -ENOMEM;
172 /* set actual size to buffer size */
173 info->size = info->buflen;
174
175 fi->fh = PTR_TO_UINT64(move_ptr(info));
176 return 0;
177 }
178
179 __lxcfs_fuse_ops int proc_access(const char *path, int mask)
180 {
181 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
182 return 0;
183
184 /* these are all read-only */
185 if ((mask & ~R_OK) != 0)
186 return -EACCES;
187
188 return 0;
189 }
190
191 __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
192 {
193 do_release_file_info(fi);
194 return 0;
195 }
196
197 static uint64_t get_memlimit(const char *cgroup, bool swap)
198 {
199 __do_free char *memlimit_str = NULL;
200 uint64_t memlimit = 0;
201 int ret;
202
203 if (swap)
204 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
205 else
206 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
207 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
208 lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
209 swap ? ".swap" : "", memlimit_str, cgroup);
210
211 return memlimit;
212 }
213
/* Walk @pos back toward @start while the preceding character is a '/'. */
static char *gnu_dirname_skip_slashes(char *pos, char *start)
{
	while (pos != start && pos[-1] == '/')
		--pos;

	return pos;
}

/*
 * Find the last '/' within the first @len bytes of @start, or NULL when
 * there is none (a backwards scan, equivalent to memrchr()).
 */
static char *gnu_dirname_last_slash(char *start, size_t len)
{
	char *cur = start + len;

	while (cur != start) {
		--cur;
		if (*cur == '/')
			return cur;
	}

	return NULL;
}

/*
 * Directory part of @path, following the dirname() logic of glibc-2.32.
 * POSIX dirname("/some-dir") will return "/some-dir" as opposed to "/",
 * which breaks `get_min_memlimit()`, hence this local copy.
 *
 * @path is modified in place: a NUL byte is written where the directory
 * part ends and @path itself is returned. When @path is NULL or contains no
 * directory part a pointer to a static "." string is returned instead and
 * @path is left untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *sep, *end;

	/* Find last '/'. */
	sep = path ? strrchr(path, '/') : NULL;

	if (sep && sep != path && sep[1] == '\0') {
		/*
		 * The '/' is the last character: step over the whole run of
		 * trailing slashes and search for a separator before it.
		 */
		end = gnu_dirname_skip_slashes(sep, path);
		if (end != path)
			sep = gnu_dirname_last_slash(path, (size_t)(end - path));
	}

	if (!sep) {
		/*
		 * The XPG specs require "." whenever no directory part is
		 * found, so a static constant string has to be handed out.
		 */
		return (char *)dot;
	}

	/* Drop every slash that belongs to the separator itself. */
	end = gnu_dirname_skip_slashes(sep, path);
	if (end == path) {
		/*
		 * Only slashes precede the last component, so the result is
		 * "/". As a special case "//" is kept when there are exactly
		 * two slashes at the beginning of the string. See XBD 4.10
		 * Path Name Resolution for more information.
		 */
		end = (sep == path + 1) ? path + 2 : path + 1;
	}

	/* Terminate the path. */
	*end = '\0';

	return path;
}
275
276 static uint64_t get_min_memlimit(const char *cgroup, bool swap)
277 {
278 __do_free char *copy = NULL;
279 uint64_t memlimit = 0, retlimit = 0;
280
281 copy = strdup(cgroup);
282 if (!copy)
283 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
284
285 retlimit = get_memlimit(copy, swap);
286
287 /*
288 * If the cgroup doesn't start with / (probably won't happen), dirname()
289 * will terminate with "" instead of "/"
290 */
291 while (*copy && strcmp(copy, "/") != 0) {
292 char *it = copy;
293
294 it = gnu_dirname(it);
295 memlimit = get_memlimit(it, swap);
296 if (memlimit > 0 && memlimit < retlimit)
297 retlimit = memlimit;
298 };
299
300 return retlimit;
301 }
302
/* Return true when @line begins with the string @pref. */
static inline bool startswith(const char *line, const char *pref)
{
	for (; *pref != '\0'; line++, pref++) {
		/* Also covers @line ending first: its NUL differs from *pref. */
		if (*line != *pref)
			return false;
	}

	return true;
}
307
308 static void get_swap_info(const char *cgroup, uint64_t memlimit,
309 uint64_t memusage, uint64_t *swtotal,
310 uint64_t *swusage, uint64_t *memswpriority)
311 {
312 __do_free char *memswusage_str = NULL, *memswpriority_str = NULL;
313 uint64_t memswlimit = 0, memswusage = 0;
314 int ret;
315
316 *swtotal = *swusage = 0;
317 *memswpriority = 1;
318
319 memswlimit = get_min_memlimit(cgroup, true);
320 if (memswlimit > 0) {
321 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
322 if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
323 return;
324
325 if (liblxcfs_memory_is_cgroupv2()) {
326 *swtotal = memswlimit / 1024;
327 *swusage = memswusage / 1024;
328 } else {
329 if (memlimit > memswlimit)
330 *swtotal = 0;
331 else
332 *swtotal = (memswlimit - memlimit) / 1024;
333 if (memusage > memswusage || swtotal == 0)
334 *swusage = 0;
335 else
336 *swusage = (memswusage - memusage) / 1024;
337 }
338
339 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
340 if (ret >= 0)
341 safe_uint64(memswpriority_str, memswpriority, 10);
342 }
343 }
344
345 static int proc_swaps_read(char *buf, size_t size, off_t offset,
346 struct fuse_file_info *fi)
347 {
348 __do_free char *cgroup = NULL, *memusage_str = NULL,
349 *memswusage_str = NULL, *memswpriority_str = NULL;
350 struct fuse_context *fc = fuse_get_context();
351 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
352 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
353 uint64_t memlimit = 0, memusage = 0,
354 swtotal = 0, swusage = 0, memswpriority = 1,
355 hostswtotal = 0, hostswfree = 0;
356 ssize_t total_len = 0;
357 ssize_t l = 0;
358 char *cache = d->buf;
359 int ret;
360 __do_free char *line = NULL;
361 __do_free void *fopen_cache = NULL;
362 __do_fclose FILE *f = NULL;
363 size_t linelen = 0;
364
365 if (offset) {
366 size_t left;
367
368 if (offset > d->size)
369 return -EINVAL;
370
371 if (!d->cached)
372 return 0;
373
374 left = d->size - offset;
375 total_len = left > size ? size: left;
376 memcpy(buf, cache + offset, total_len);
377
378 return total_len;
379 }
380
381 pid_t initpid = lookup_initpid_in_store(fc->pid);
382 if (initpid <= 1 || is_shared_pidns(initpid))
383 initpid = fc->pid;
384
385 cgroup = get_pid_cgroup(initpid, "memory");
386 if (!cgroup)
387 return read_file_fuse("/proc/swaps", buf, size, d);
388 prune_init_slice(cgroup);
389
390 memlimit = get_min_memlimit(cgroup, false);
391
392 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
393 if (ret < 0)
394 return 0;
395
396 if (safe_uint64(memusage_str, &memusage, 10) < 0)
397 lxcfs_error("Failed to convert memusage %s", memusage_str);
398
399 if (wants_swap)
400 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
401
402 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
403
404 /* Read host total and free values */
405 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
406 if (!f)
407 return 0;
408
409 while (getline(&line, &linelen, f) != -1) {
410 if (startswith(line, "SwapTotal:"))
411 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
412 else if (startswith(line, "SwapFree:"))
413 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
414 }
415
416 if (wants_swap) {
417 /* The total amount of swap is always reported to be the
418 lesser of the RAM+SWAP limit or the SWAP device size.
419 This is because the kernel can swap as much as it
420 wants and not only up to swtotal. */
421 swtotal = memlimit / 1024 + swtotal;
422 if (hostswtotal < swtotal) {
423 swtotal = hostswtotal;
424 }
425
426 /* When swappiness is 0, pretend we can't swap. */
427 if (memswpriority == 0) {
428 swtotal = swusage;
429 }
430 }
431
432 if (swtotal > 0) {
433 l = snprintf(d->buf + total_len, d->size - total_len,
434 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
435 36, " ", swtotal, swusage);
436 total_len += l;
437 }
438
439 if (total_len < 0 || l < 0)
440 return log_error(0, "Failed writing to cache");
441
442 d->cached = 1;
443 d->size = (int)total_len;
444
445 if ((size_t)total_len > size)
446 total_len = size;
447 memcpy(buf, d->buf, total_len);
448
449 return total_len;
450 }
451
/*
 * Look up one counter in a multi-line block I/O statistics string.
 *
 * @str is searched line by line for a line that starts with
 * "<major>:<minor> <iotype>"; the unsigned number following that prefix is
 * stored in *@v.
 *
 * *@v is set to 0 when @str is NULL, when no line matches or when no number
 * follows the matching prefix.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32] = {0};
	size_t keylen;
	char *eol;

	*v = 0;

	/*
	 * Callers pass whatever the cgroup getter left in the string
	 * pointer, which is still NULL when the getter failed.
	 */
	if (!str)
		return;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	while (*str) {
		if (strncmp(str, key, keylen) == 0) {
			sscanf(str + keylen, "%" PRIu64, v);
			return;
		}

		/* Advance to the start of the next line, if there is one. */
		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
475
/*
 * One line of /proc/diskstats. The number in each field comment is the
 * position of the field within the line; proc_diskstats_read() prints the
 * fields in exactly this order.
 */
struct lxcfs_diskstats {
	unsigned int major; /* 1 - major number */
	unsigned int minor; /* 2 - minor number */
	char dev_name[72]; /* 3 - device name */
	uint64_t read; /* 4 - reads completed successfully */
	uint64_t read_merged; /* 5 - reads merged */
	uint64_t read_sectors; /* 6 - sectors read */
	uint64_t read_ticks; /* 7 - time spent reading (ms) */
	uint64_t write; /* 8 - writes completed */
	uint64_t write_merged; /* 9 - writes merged */
	uint64_t write_sectors; /* 10 - sectors written */
	uint64_t write_ticks; /* 11 - time spent writing (ms) */
	uint64_t ios_pgr; /* 12 - I/Os currently in progress */
	uint64_t total_ticks; /* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks; /* 14 - weighted time spent doing I/Os (ms) */
	uint64_t discard; /* 15 - discards completed successfully (4.18+) */
	uint64_t discard_merged; /* 16 - discards merged (4.18+) */
	uint64_t discard_sectors; /* 17 - sectors discarded (4.18+) */
	uint64_t discard_ticks; /* 18 - time spent discarding (4.18+) */
};
496
497 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
498 struct fuse_file_info *fi)
499 {
500 __do_free char *cg = NULL, *io_serviced_str = NULL,
501 *io_merged_str = NULL, *io_service_bytes_str = NULL,
502 *io_wait_time_str = NULL, *io_service_time_str = NULL,
503 *line = NULL;
504 __do_free void *fopen_cache = NULL;
505 __do_fclose FILE *f = NULL;
506 struct fuse_context *fc = fuse_get_context();
507 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
508 struct lxcfs_diskstats stats = {};
509 /* helper fields */
510 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
511 write_wait_time, discard_wait_time;
512 char *cache = d->buf;
513 size_t cache_size = d->buflen;
514 size_t linelen = 0, total_len = 0;
515 int i = 0;
516 int ret;
517
518 if (offset) {
519 size_t left;
520
521 if (offset > d->size)
522 return -EINVAL;
523
524 if (!d->cached)
525 return 0;
526
527 left = d->size - offset;
528 total_len = left > size ? size: left;
529 memcpy(buf, cache + offset, total_len);
530
531 return total_len;
532 }
533
534 pid_t initpid = lookup_initpid_in_store(fc->pid);
535 if (initpid <= 1 || is_shared_pidns(initpid))
536 initpid = fc->pid;
537
538 cg = get_pid_cgroup(initpid, "blkio");
539 if (!cg)
540 return read_file_fuse("/proc/diskstats", buf, size, d);
541 prune_init_slice(cg);
542
543 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
544 if (ret < 0) {
545 if (ret == -EOPNOTSUPP)
546 return read_file_fuse("/proc/diskstats", buf, size, d);
547 }
548
549 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
550 if (ret < 0) {
551 if (ret == -EOPNOTSUPP)
552 return read_file_fuse("/proc/diskstats", buf, size, d);
553 }
554
555 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
556 if (ret < 0) {
557 if (ret == -EOPNOTSUPP)
558 return read_file_fuse("/proc/diskstats", buf, size, d);
559 }
560
561 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
562 if (ret < 0) {
563 if (ret == -EOPNOTSUPP)
564 return read_file_fuse("/proc/diskstats", buf, size, d);
565 }
566
567 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
568 if (ret < 0) {
569 if (ret == -EOPNOTSUPP)
570 return read_file_fuse("/proc/diskstats", buf, size, d);
571 }
572
573 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
574 if (!f)
575 return 0;
576
577 while (getline(&line, &linelen, f) != -1) {
578 ssize_t l;
579 char lbuf[256];
580
581 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
582 if (i != 3)
583 continue;
584
585 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
586 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
587 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
588
589 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
590 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
591 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
592
593 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
594 stats.read_sectors = stats.read_sectors / 512;
595 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
596 stats.write_sectors = stats.write_sectors / 512;
597 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
598 stats.discard_sectors = stats.discard_sectors / 512;
599
600 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
601 read_service_time = read_service_time / 1000000;
602 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
603 read_wait_time = read_wait_time / 1000000;
604 stats.read_ticks = read_service_time + read_wait_time;
605
606 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
607 write_service_time = write_service_time / 1000000;
608 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
609 write_wait_time = write_wait_time / 1000000;
610 stats.write_ticks = write_service_time + write_wait_time;
611
612 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
613 discard_service_time = discard_service_time / 1000000;
614 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
615 discard_wait_time = discard_wait_time / 1000000;
616 stats.discard_ticks = discard_service_time + discard_wait_time;
617
618 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
619 stats.total_ticks = stats.total_ticks / 1000000;
620
621 memset(lbuf, 0, sizeof(lbuf));
622 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
623 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
624 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks || stats.discard ||
625 stats.discard_merged || stats.discard_sectors || stats.discard_ticks) {
626 ret = strnprintf(
627 lbuf,
628 sizeof(lbuf),
629 "%u %u" /* major, minor */
630 " %s" /* dev_name */
631 " %" PRIu64 /* read */
632 " %" PRIu64 /* read_merged */
633 " %" PRIu64 /* read_sectors */
634 " %" PRIu64 /* read_ticks */
635 " %" PRIu64 /* write */
636 " %" PRIu64 /* write_merged */
637 " %" PRIu64 /* write_sectors */
638 " %" PRIu64 /* write_ticks */
639 " %" PRIu64 /* ios_pgr */
640 " %" PRIu64 /* total_ticks */
641 " %" PRIu64 /* rq_ticks */
642 " %" PRIu64 /* discard */
643 " %" PRIu64 /* discard_merged */
644 " %" PRIu64 /* discard_sectors */
645 " %" PRIu64 /* discard_ticks */
646 "\n",
647 stats.major,
648 stats.minor,
649 stats.dev_name,
650 stats.read,
651 stats.read_merged,
652 stats.read_sectors,
653 stats.read_ticks,
654 stats.write,
655 stats.write_merged,
656 stats.write_sectors,
657 stats.write_ticks,
658 stats.ios_pgr,
659 stats.total_ticks,
660 stats.rq_ticks,
661 stats.discard,
662 stats.discard_merged,
663 stats.discard_sectors,
664 stats.discard_ticks);
665 if (ret < 0) {
666 lxcfs_error("Insufficient buffer for %u:%u %s diskstats",
667 stats.major, stats.minor, stats.dev_name);
668 continue;
669 }
670 } else {
671 continue;
672 }
673
674 l = snprintf(cache, cache_size, "%s", lbuf);
675 if (l < 0)
676 return log_error(0, "Failed to write cache");
677 if ((size_t)l >= cache_size)
678 return log_error(0, "Write to cache was truncated");
679
680 cache += l;
681 cache_size -= l;
682 total_len += l;
683 }
684
685 d->cached = 1;
686 d->size = total_len;
687 if (total_len > size)
688 total_len = size;
689 memcpy(buf, d->buf, total_len);
690
691 return total_len;
692 }
693
#ifdef RELOADTEST
/*
 * Only built with RELOADTEST: create the regular file /tmp/lxcfs-iwashere
 * (mode 0644) as a marker. The result of mknod() is not checked.
 */
static inline void iwashere(void)
{
	static const char marker_path[] = "/tmp/lxcfs-iwashere";

	mknod(marker_path, S_IFREG, 0644);
}
#endif
700
701 /*
702 * This function retrieves the busy time of a group of tasks by looking at
703 * cpuacct.usage. Unfortunately, this only makes sense when the container has
704 * been given it's own cpuacct cgroup. If not, this function will take the busy
705 * time of all other taks that do not actually belong to the container into
706 * account as well. If someone has a clever solution for this please send a
707 * patch!
708 */
709 static double get_reaper_busy(pid_t task)
710 {
711 __do_free char *cgroup = NULL, *usage_str = NULL;
712 uint64_t usage = 0;
713 pid_t initpid;
714
715 initpid = lookup_initpid_in_store(task);
716 if (initpid <= 0)
717 return 0;
718
719 cgroup = get_pid_cgroup(initpid, "cpuacct");
720 if (!cgroup)
721 return 0;
722 prune_init_slice(cgroup);
723
724 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
725 return 0;
726
727 if (safe_uint64(usage_str, &usage, 10) < 0)
728 lxcfs_error("Failed to convert usage %s", usage_str);
729
730 return ((double)usage / 1000000000);
731 }
732
733 static uint64_t get_reaper_start_time(pid_t pid)
734 {
735 __do_free void *fopen_cache = NULL;
736 __do_fclose FILE *f = NULL;
737 int ret;
738 uint64_t starttime;
739 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
740 STRLITERALLEN("/stat") + 1];
741 pid_t qpid;
742
743 qpid = lookup_initpid_in_store(pid);
744 if (qpid <= 0)
745 return ret_errno(EINVAL);
746
747 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
748 if (ret < 0 || (size_t)ret >= sizeof(path))
749 return ret_errno(EINVAL);
750
751 f = fopen_cached(path, "re", &fopen_cache);
752 if (!f)
753 return ret_errno(EINVAL);
754
755 /* Note that the *scanf() argument supression requires that length
756 * modifiers such as "l" are omitted. Otherwise some compilers will yell
757 * at us. It's like telling someone you're not married and then asking
758 * if you can bring your wife to the party.
759 */
760 ret = fscanf(f, "%*d " /* (1) pid %d */
761 "%*s " /* (2) comm %s */
762 "%*c " /* (3) state %c */
763 "%*d " /* (4) ppid %d */
764 "%*d " /* (5) pgrp %d */
765 "%*d " /* (6) session %d */
766 "%*d " /* (7) tty_nr %d */
767 "%*d " /* (8) tpgid %d */
768 "%*u " /* (9) flags %u */
769 "%*u " /* (10) minflt %lu */
770 "%*u " /* (11) cminflt %lu */
771 "%*u " /* (12) majflt %lu */
772 "%*u " /* (13) cmajflt %lu */
773 "%*u " /* (14) utime %lu */
774 "%*u " /* (15) stime %lu */
775 "%*d " /* (16) cutime %ld */
776 "%*d " /* (17) cstime %ld */
777 "%*d " /* (18) priority %ld */
778 "%*d " /* (19) nice %ld */
779 "%*d " /* (20) num_threads %ld */
780 "%*d " /* (21) itrealvalue %ld */
781 "%" PRIu64, /* (22) starttime %llu */
782 &starttime);
783 if (ret != 1)
784 return ret_errno(EINVAL);
785
786 return ret_set_errno(starttime, 0);
787 }
788
/*
 * Start time of the reaper of @pid expressed in seconds.
 *
 * The tick count from get_reaper_start_time() is divided by the number of
 * clock ticks per second reported by sysconf(_SC_CLK_TCK).
 *
 * Returns 0 when the start time is reported as 0 or the tick rate cannot be
 * determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t start_ticks;
	int64_t ticks_per_sec;

	start_ticks = get_reaper_start_time(pid);
	if (start_ticks == 0)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)start_ticks / (uint64_t)ticks_per_sec;
}
807
808 static double get_reaper_age(pid_t pid)
809 {
810 uint64_t uptime_ms;
811 double procstart, procage;
812
813 /*
814 * We need to substract the time the process has started since system
815 * boot minus the time when the system has started to get the actual
816 * reaper age.
817 */
818 procstart = get_reaper_start_time_in_sec(pid);
819 procage = procstart;
820 if (procstart > 0) {
821 int ret;
822 struct timespec spec;
823
824 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
825 if (ret < 0)
826 return 0;
827
828 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
829 procage = (uptime_ms - (procstart * 1000)) / 1000;
830 }
831
832 return procage;
833 }
834
835 /*
836 * We read /proc/uptime and reuse its second field.
837 * For the first field, we use the mtime for the reaper for
838 * the calling pid as returned by getreaperage
839 */
840 static int proc_uptime_read(char *buf, size_t size, off_t offset,
841 struct fuse_file_info *fi)
842 {
843 struct fuse_context *fc = fuse_get_context();
844 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
845 char *cache = d->buf;
846 ssize_t total_len = 0, ret = 0;
847 double busytime, idletime, reaperage;
848
849 #ifdef RELOADTEST
850 iwashere();
851 #endif
852
853 if (offset) {
854 size_t left;
855
856 if (offset > d->size)
857 return -EINVAL;
858
859 if (!d->cached)
860 return 0;
861
862 left = d->size - offset;
863 total_len = left > size ? size : left;
864 memcpy(buf, cache + offset, total_len);
865
866 return total_len;
867 }
868
869 reaperage = get_reaper_age(fc->pid);
870 /*
871 * To understand why this is done, please read the comment to the
872 * get_reaper_busy() function.
873 */
874 idletime = reaperage;
875 busytime = get_reaper_busy(fc->pid);
876 if (reaperage >= busytime)
877 idletime = reaperage - busytime;
878
879 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
880 if (ret < 0 || ret >= d->buflen)
881 return read_file_fuse("/proc/uptime", buf, size, d);
882 total_len = ret;
883
884 d->cached = 1;
885 d->size = total_len;
886 if ((size_t)total_len > size)
887 total_len = size;
888 memcpy(buf, d->buf, total_len);
889
890 return total_len;
891 }
892
893 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
894 static int proc_stat_read(char *buf, size_t size, off_t offset,
895 struct fuse_file_info *fi)
896 {
897 __do_free char *cg = NULL, *cpu_cg = NULL, *cpuset = NULL, *line = NULL;
898 __do_free void *fopen_cache = NULL;
899 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
900 __do_fclose FILE *f = NULL;
901 struct fuse_context *fc = fuse_get_context();
902 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
903 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
904 size_t linelen = 0, total_len = 0;
905 int curcpu = -1; /* cpu numbering starts at 0 */
906 int physcpu = 0;
907 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
908 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
909 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
910 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
911 guest_sum = 0, guest_nice_sum = 0;
912 char cpuall[CPUALL_MAX_SIZE];
913 /* reserve for cpu all */
914 char *cache = d->buf + CPUALL_MAX_SIZE;
915 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
916 int cg_cpu_usage_size = 0;
917
918 if (offset) {
919 size_t left;
920
921 if (offset > d->size)
922 return -EINVAL;
923
924 if (!d->cached)
925 return 0;
926
927 left = d->size - offset;
928 total_len = left > size ? size : left;
929 memcpy(buf, d->buf + offset, total_len);
930
931 return total_len;
932 }
933
934 pid_t initpid = lookup_initpid_in_store(fc->pid);
935 if (initpid <= 1 || is_shared_pidns(initpid))
936 initpid = fc->pid;
937
938 /*
939 * when container run with host pid namespace initpid == 1, cgroup will "/"
940 * we should return host os's /proc contents.
941 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
942 */
943 if (initpid == 1)
944 return read_file_fuse("/proc/stat", buf, size, d);
945
946 cg = get_pid_cgroup(initpid, "cpuset");
947 if (!cg)
948 return read_file_fuse("/proc/stat", buf, size, d);
949 prune_init_slice(cg);
950 cpu_cg = get_pid_cgroup(initpid, "cpu");
951 if (!cpu_cg)
952 return read_file_fuse("/proc/stat", buf, size, d);
953 prune_init_slice(cpu_cg);
954 cpuset = get_cpuset(cg);
955 if (!cpuset)
956 return 0;
957
958 f = fopen_cached("/proc/stat", "re", &fopen_cache);
959 if (!f)
960 return 0;
961
962 /* Skip first system cpu line. */
963 if (getline(&line, &linelen, f) < 0)
964 return log_error(0, "proc_stat_read read first line failed");
965
966 /*
967 * Read cpuacct.usage_all for all CPUs.
968 * If the cpuacct cgroup is present, it is used to calculate the container's
969 * CPU usage. If not, values from the host's /proc/stat are used.
970 */
971 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
972 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
973 total_len = cpuview_proc_stat(cg, cpu_cg, cpuset, cg_cpu_usage,
974 cg_cpu_usage_size, f,
975 d->buf, d->buflen);
976 goto out;
977 }
978 } else {
979 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
980 }
981
982 while (getline(&line, &linelen, f) != -1) {
983 ssize_t l;
984 char cpu_char[10]; /* That's a lot of cores */
985 char *c;
986 uint64_t all_used, cg_used, new_idle;
987 int ret, cpu_to_render;
988
989 if (strlen(line) == 0)
990 continue;
991 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
992 /* not a ^cpuN line containing a number N, just print it */
993 l = snprintf(cache, cache_size, "%s", line);
994 if (l < 0)
995 return log_error(0, "Failed to write cache");
996 if ((size_t)l >= cache_size)
997 return log_error(0, "Write to cache was truncated");
998
999 cache += l;
1000 cache_size -= l;
1001 total_len += l;
1002
1003 continue;
1004 }
1005
1006 if (sscanf(cpu_char, "%d", &physcpu) != 1)
1007 continue;
1008
1009 if (!cpu_in_cpuset(physcpu, cpuset))
1010 continue;
1011
1012 curcpu++;
1013
1014 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
1015 cpu_to_render = curcpu;
1016 else
1017 cpu_to_render = physcpu;
1018
1019 ret = sscanf(
1020 line,
1021 "%*s" /* <skip> */
1022 " %" PRIu64 /* user */
1023 " %" PRIu64 /* nice */
1024 " %" PRIu64 /* system */
1025 " %" PRIu64 /* idle */
1026 " %" PRIu64 /* iowait */
1027 " %" PRIu64 /* irq */
1028 " %" PRIu64 /* softirq */
1029 " %" PRIu64 /* steal */
1030 " %" PRIu64 /* guest */
1031 " %" PRIu64, /* guest_nice */
1032 &user,
1033 &nice,
1034 &system,
1035 &idle,
1036 &iowait,
1037 &irq,
1038 &softirq,
1039 &steal,
1040 &guest,
1041 &guest_nice);
1042 if (ret != 10 || !cg_cpu_usage) {
1043 c = strchr(line, ' ');
1044 if (!c)
1045 continue;
1046
1047 l = snprintf(cache, cache_size, "cpu%d%s", cpu_to_render, c);
1048 if (l < 0)
1049 return log_error(0, "Failed to write cache");
1050 if ((size_t)l >= cache_size)
1051 return log_error(0, "Write to cache was truncated");
1052
1053 cache += l;
1054 cache_size -= l;
1055 total_len += l;
1056
1057 if (ret != 10)
1058 continue;
1059 }
1060
1061 if (cg_cpu_usage) {
1062 if (physcpu >= cg_cpu_usage_size)
1063 break;
1064
1065 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1066 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1067
1068 if (all_used >= cg_used) {
1069 new_idle = idle + (all_used - cg_used);
1070 } else {
1071 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
1072 cpu_to_render, cg, all_used, cg_used);
1073 new_idle = idle;
1074 }
1075
1076 l = snprintf(cache, cache_size,
1077 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1078 cpu_to_render, cg_cpu_usage[physcpu].user,
1079 cg_cpu_usage[physcpu].system, new_idle);
1080 if (l < 0)
1081 return log_error(0, "Failed to write cache");
1082 if ((size_t)l >= cache_size)
1083 return log_error(0, "Write to cache was truncated");
1084
1085 cache += l;
1086 cache_size -= l;
1087 total_len += l;
1088
1089 user_sum += cg_cpu_usage[physcpu].user;
1090 system_sum += cg_cpu_usage[physcpu].system;
1091 idle_sum += new_idle;
1092 } else {
1093 user_sum += user;
1094 nice_sum += nice;
1095 system_sum += system;
1096 idle_sum += idle;
1097 iowait_sum += iowait;
1098 irq_sum += irq;
1099 softirq_sum += softirq;
1100 steal_sum += steal;
1101 guest_sum += guest;
1102 guest_nice_sum += guest_nice;
1103 }
1104 }
1105
1106 cache = d->buf;
1107
1108 int cpuall_len = snprintf(
1109 cpuall,
1110 CPUALL_MAX_SIZE,
1111 "cpu "
1112 " %" PRIu64 /* user_sum */
1113 " %" PRIu64 /* nice_sum */
1114 " %" PRIu64 /* system_sum */
1115 " %" PRIu64 /* idle_sum */
1116 " %" PRIu64 /* iowait_sum */
1117 " %" PRIu64 /* irq_sum */
1118 " %" PRIu64 /* softirq_sum */
1119 " %" PRIu64 /* steal_sum */
1120 " %" PRIu64 /* guest_sum */
1121 " %" PRIu64 /* guest_nice_sum */
1122 "\n",
1123 user_sum,
1124 nice_sum,
1125 system_sum,
1126 idle_sum,
1127 iowait_sum,
1128 irq_sum,
1129 softirq_sum,
1130 steal_sum,
1131 guest_sum,
1132 guest_nice_sum);
1133 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1134 memcpy(cache, cpuall, cpuall_len);
1135 cache += cpuall_len;
1136 } else {
1137 /* shouldn't happen */
1138 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1139 cpuall_len = 0;
1140 }
1141
1142 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1143 total_len += cpuall_len;
1144
1145 out:
1146 d->cached = 1;
1147 d->size = total_len;
1148 if (total_len > size)
1149 total_len = size;
1150
1151 memcpy(buf, d->buf, total_len);
1152 return total_len;
1153 }
1154
1155 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1156 static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1157 {
1158 __do_close int fd = -EBADF;
1159 __do_fclose FILE *f = NULL;
1160 __do_free char *line = NULL;
1161 __do_free void *fdopen_cache = NULL;
1162 bool unified;
1163 size_t len = 0;
1164 ssize_t linelen;
1165
1166 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1167 if (fd < 0)
1168 return false;
1169
1170 f = fdopen_cached(fd, "re", &fdopen_cache);
1171 if (!f)
1172 return false;
1173
1174 unified = pure_unified_layout(cgroup_ops);
1175 while ((linelen = getline(&line, &len, f)) != -1) {
1176 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1177 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1178 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1179 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
1180 } else if (startswith(line, unified ? "file" :"total_cache")) {
1181 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
1182 } else if (!unified && startswith(line, "total_rss")) {
1183 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1184 } else if (!unified && startswith(line, "total_rss_huge")) {
1185 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1186 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1187 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1188 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1189 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1190 } else if (!unified && startswith(line, "total_dirty")) {
1191 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1192 } else if (!unified && startswith(line, "total_writeback")) {
1193 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1194 } else if (!unified && startswith(line, "total_swap")) {
1195 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1196 } else if (!unified && startswith(line, "total_pgpgin")) {
1197 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1198 } else if (!unified && startswith(line, "total_pgpgout")) {
1199 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1200 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1201 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1202 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1203 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1204 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1205 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1206 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1207 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1208 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1209 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1210 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1211 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1212 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1213 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1214 }
1215 }
1216
1217 return true;
1218 }
1219
1220 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1221 struct fuse_file_info *fi)
1222 {
1223 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
1224 *memswusage_str = NULL, *memswpriority_str = NULL;
1225 __do_free void *fopen_cache = NULL;
1226 __do_fclose FILE *f = NULL;
1227 struct fuse_context *fc = fuse_get_context();
1228 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
1229 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1230 uint64_t memlimit = 0, memusage = 0,
1231 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1232 memswpriority = 1;
1233 struct memory_stat mstat = {};
1234 size_t linelen = 0, total_len = 0;
1235 char *cache = d->buf;
1236 size_t cache_size = d->buflen;
1237 int ret;
1238
1239 if (offset) {
1240 size_t left;
1241
1242 if (offset > d->size)
1243 return -EINVAL;
1244
1245 if (!d->cached)
1246 return 0;
1247
1248 left = d->size - offset;
1249 total_len = left > size ? size : left;
1250 memcpy(buf, cache + offset, total_len);
1251
1252 return total_len;
1253 }
1254
1255 pid_t initpid = lookup_initpid_in_store(fc->pid);
1256 if (initpid <= 1 || is_shared_pidns(initpid))
1257 initpid = fc->pid;
1258
1259 cgroup = get_pid_cgroup(initpid, "memory");
1260 if (!cgroup)
1261 return read_file_fuse("/proc/meminfo", buf, size, d);
1262
1263 prune_init_slice(cgroup);
1264
1265 /* memory limits */
1266 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1267 if (ret < 0)
1268 return read_file_fuse("/proc/meminfo", buf, size, d);
1269
1270 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1271 lxcfs_error("Failed to convert memusage %s", memusage_str);
1272
1273 if (!cgroup_parse_memory_stat(cgroup, &mstat))
1274 return read_file_fuse("/proc/meminfo", buf, size, d);
1275
1276 memlimit = get_min_memlimit(cgroup, false);
1277
1278 /*
1279 * Following values are allowed to fail, because swapaccount might be
1280 * turned off for current kernel.
1281 */
1282 if (wants_swap)
1283 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1284
1285 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1286 if (!f)
1287 return read_file_fuse("/proc/meminfo", buf, size, d);
1288
1289 memusage /= 1024;
1290 memlimit /= 1024;
1291 while (getline(&line, &linelen, f) != -1) {
1292 ssize_t l;
1293 char *printme, lbuf[100];
1294
1295 memset(lbuf, 0, 100);
1296 if (startswith(line, "MemTotal:")) {
1297 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
1298 if (memlimit == 0)
1299 memlimit = hosttotal;
1300
1301 if (hosttotal < memlimit)
1302 memlimit = hosttotal;
1303 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1304 printme = lbuf;
1305 } else if (startswith(line, "MemFree:")) {
1306 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1307 printme = lbuf;
1308 } else if (startswith(line, "MemAvailable:")) {
1309 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024);
1310 printme = lbuf;
1311 } else if (startswith(line, "SwapTotal:")) {
1312 if (wants_swap) {
1313 uint64_t hostswtotal = 0;
1314
1315 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1316
1317 /* The total amount of swap is always reported to be the
1318 lesser of the RAM+SWAP limit or the SWAP device size.
1319 This is because the kernel can swap as much as it
1320 wants and not only up to swtotal. */
1321
1322 if (!liblxcfs_memory_is_cgroupv2())
1323 swtotal += memlimit;
1324
1325 if (hostswtotal < swtotal) {
1326 swtotal = hostswtotal;
1327 }
1328
1329 /* When swappiness is 0, pretend we can't swap. */
1330 if (memswpriority == 0) {
1331 swtotal = swusage;
1332 }
1333 }
1334
1335 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1336 printme = lbuf;
1337 } else if (startswith(line, "SwapFree:")) {
1338 if (wants_swap) {
1339 swfree = swtotal - swusage;
1340 }
1341
1342 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1343 printme = lbuf;
1344 } else if (startswith(line, "Slab:")) {
1345 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1346 printme = lbuf;
1347 } else if (startswith(line, "Buffers:")) {
1348 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1349 printme = lbuf;
1350 } else if (startswith(line, "Cached:")) {
1351 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1352 mstat.total_cache / 1024);
1353 printme = lbuf;
1354 } else if (startswith(line, "SwapCached:")) {
1355 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1356 printme = lbuf;
1357 } else if (startswith(line, "Active:")) {
1358 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1359 (mstat.total_active_anon +
1360 mstat.total_active_file) /
1361 1024);
1362 printme = lbuf;
1363 } else if (startswith(line, "Inactive:")) {
1364 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1365 (mstat.total_inactive_anon +
1366 mstat.total_inactive_file) /
1367 1024);
1368 printme = lbuf;
1369 } else if (startswith(line, "Active(anon):")) {
1370 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1371 mstat.total_active_anon / 1024);
1372 printme = lbuf;
1373 } else if (startswith(line, "Inactive(anon):")) {
1374 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1375 mstat.total_inactive_anon / 1024);
1376 printme = lbuf;
1377 } else if (startswith(line, "Active(file):")) {
1378 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1379 mstat.total_active_file / 1024);
1380 printme = lbuf;
1381 } else if (startswith(line, "Inactive(file):")) {
1382 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1383 mstat.total_inactive_file / 1024);
1384 printme = lbuf;
1385 } else if (startswith(line, "Unevictable:")) {
1386 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1387 mstat.total_unevictable / 1024);
1388 printme = lbuf;
1389 } else if (startswith(line, "Dirty:")) {
1390 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1391 mstat.total_dirty / 1024);
1392 printme = lbuf;
1393 } else if (startswith(line, "Writeback:")) {
1394 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1395 mstat.total_writeback / 1024);
1396 printme = lbuf;
1397 } else if (startswith(line, "AnonPages:")) {
1398 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1399 (mstat.total_active_anon +
1400 mstat.total_inactive_anon - mstat.total_shmem) /
1401 1024);
1402 printme = lbuf;
1403 } else if (startswith(line, "Mapped:")) {
1404 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1405 mstat.total_mapped_file / 1024);
1406 printme = lbuf;
1407 } else if (startswith(line, "SReclaimable:")) {
1408 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1409 printme = lbuf;
1410 } else if (startswith(line, "SUnreclaim:")) {
1411 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1412 printme = lbuf;
1413 } else if (startswith(line, "Shmem:")) {
1414 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1415 mstat.total_shmem / 1024);
1416 printme = lbuf;
1417 } else if (startswith(line, "ShmemHugePages:")) {
1418 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1419 printme = lbuf;
1420 } else if (startswith(line, "ShmemPmdMapped:")) {
1421 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1422 printme = lbuf;
1423 } else if (startswith(line, "AnonHugePages:")) {
1424 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1425 mstat.total_rss_huge / 1024);
1426 printme = lbuf;
1427 } else {
1428 printme = line;
1429 }
1430
1431 l = snprintf(cache, cache_size, "%s", printme);
1432 if (l < 0)
1433 return log_error(0, "Failed to write cache");
1434 if ((size_t)l >= cache_size)
1435 return log_error(0, "Write to cache was truncated");
1436
1437 cache += l;
1438 cache_size -= l;
1439 total_len += l;
1440 }
1441
1442 d->cached = 1;
1443 d->size = total_len;
1444 if (total_len > size)
1445 total_len = size;
1446 memcpy(buf, d->buf, total_len);
1447
1448 return total_len;
1449 }
1450
1451 static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1452 struct fuse_file_info *fi)
1453 {
1454 __do_free char *cgroup = NULL, *line = NULL;
1455 __do_free void *fopen_cache = NULL;
1456 __do_fclose FILE *f = NULL;
1457 __do_close int fd = -EBADF;
1458 struct fuse_context *fc = fuse_get_context();
1459 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1460 size_t linelen = 0, total_len = 0;
1461 char *cache = d->buf;
1462 size_t cache_size = d->buflen;
1463 pid_t initpid;
1464
1465 if (offset) {
1466 size_t left;
1467
1468 if (offset > d->size)
1469 return -EINVAL;
1470
1471 if (!d->cached)
1472 return 0;
1473
1474 left = d->size - offset;
1475 total_len = left > size ? size : left;
1476 memcpy(buf, cache + offset, total_len);
1477
1478 return total_len;
1479 }
1480
1481 initpid = lookup_initpid_in_store(fc->pid);
1482 if (initpid <= 1 || is_shared_pidns(initpid))
1483 initpid = fc->pid;
1484
1485 cgroup = get_pid_cgroup(initpid, "memory");
1486 if (!cgroup)
1487 return read_file_fuse("/proc/slabinfo", buf, size, d);
1488
1489 prune_init_slice(cgroup);
1490
1491 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1492 if (fd < 0)
1493 return read_file_fuse("/proc/slabinfo", buf, size, d);
1494
1495 f = fdopen_cached(fd, "re", &fopen_cache);
1496 if (!f)
1497 return read_file_fuse("/proc/slabinfo", buf, size, d);
1498
1499 while (getline(&line, &linelen, f) != -1) {
1500 ssize_t l = snprintf(cache, cache_size, "%s", line);
1501 if (l < 0)
1502 return log_error(0, "Failed to write cache");
1503 if ((size_t)l >= cache_size)
1504 return log_error(0, "Write to cache was truncated");
1505
1506 cache += l;
1507 cache_size -= l;
1508 total_len += l;
1509 }
1510
1511 d->cached = 1;
1512 d->size = total_len;
1513 if (total_len > size)
1514 total_len = size;
1515 memcpy(buf, d->buf, total_len);
1516
1517 return total_len;
1518 }
1519
1520 static int proc_read_with_personality(int (*do_proc_read)(char *, size_t, off_t,
1521 struct fuse_file_info *), char *buf, size_t size, off_t offset,
1522 struct fuse_file_info *fi)
1523 {
1524 struct fuse_context *fc = fuse_get_context();
1525 __u32 host_personality = liblxcfs_personality(), caller_personality;
1526 bool change_personality;
1527 int ret, read_ret;
1528
1529 if (get_task_personality(fc->pid, &caller_personality) < 0)
1530 return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid);
1531
1532 /* do we need to change thread personality? */
1533 change_personality = host_personality != caller_personality;
1534
1535 if (change_personality) {
1536 ret = personality(caller_personality);
1537 if (ret == -1)
1538 return log_error(0, "Call to personality(%d) failed: %s\n",
1539 caller_personality, strerror(errno));
1540
1541 lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n",
1542 (int)syscall(SYS_gettid), ret, caller_personality);
1543 }
1544
1545 read_ret = do_proc_read(buf, size, offset, fi);
1546
1547 if (change_personality) {
1548 ret = personality(host_personality);
1549 if (ret == -1)
1550 return log_error(0, "Call to personality(%d) failed: %s\n",
1551 host_personality, strerror(errno));
1552
1553 lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n",
1554 (int)syscall(SYS_gettid), ret, host_personality);
1555 }
1556
1557 return read_ret;
1558 }
1559
1560 __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1561 off_t offset, struct fuse_file_info *fi)
1562 {
1563 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1564
1565 switch (f->type) {
1566 case LXC_TYPE_PROC_MEMINFO:
1567 if (liblxcfs_functional())
1568 return proc_meminfo_read(buf, size, offset, fi);
1569
1570 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1571 buf, size, offset, f);
1572 case LXC_TYPE_PROC_CPUINFO:
1573 if (liblxcfs_functional())
1574 return proc_read_with_personality(&proc_cpuinfo_read, buf, size, offset, fi);
1575
1576 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1577 buf, size, offset, f);
1578 case LXC_TYPE_PROC_UPTIME:
1579 if (liblxcfs_functional())
1580 return proc_uptime_read(buf, size, offset, fi);
1581
1582 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1583 buf, size, offset, f);
1584 case LXC_TYPE_PROC_STAT:
1585 if (liblxcfs_functional())
1586 return proc_stat_read(buf, size, offset, fi);
1587
1588 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1589 size, offset, f);
1590 case LXC_TYPE_PROC_DISKSTATS:
1591 if (liblxcfs_functional())
1592 return proc_diskstats_read(buf, size, offset, fi);
1593
1594 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1595 buf, size, offset, f);
1596 case LXC_TYPE_PROC_SWAPS:
1597 if (liblxcfs_functional())
1598 return proc_swaps_read(buf, size, offset, fi);
1599
1600 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1601 size, offset, f);
1602 case LXC_TYPE_PROC_LOADAVG:
1603 if (liblxcfs_functional())
1604 return proc_loadavg_read(buf, size, offset, fi);
1605
1606 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1607 buf, size, offset, f);
1608 case LXC_TYPE_PROC_SLABINFO:
1609 if (liblxcfs_functional())
1610 return proc_slabinfo_read(buf, size, offset, fi);
1611
1612 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1613 buf, size, offset, f);
1614 }
1615
1616 return -EINVAL;
1617 }