]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_fuse.c
make meminfo and swaps cgroupv2 aware
[mirror_lxcfs.git] / src / proc_fuse.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <pthread.h>
11 #include <sched.h>
12 #include <stdarg.h>
13 #include <stdbool.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <time.h>
19 #include <unistd.h>
20 #include <wait.h>
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
24 #include <sys/mman.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/socket.h>
28 #include <sys/syscall.h>
29 #include <sys/sysinfo.h>
30 #include <sys/vfs.h>
31
32 #include "proc_fuse.h"
33
34 #include "bindings.h"
35 #include "cgroup_fuse.h"
36 #include "cgroups/cgroup.h"
37 #include "cgroups/cgroup_utils.h"
38 #include "cpuset_parse.h"
39 #include "lxcfs_fuse_compat.h"
40 #include "memory_utils.h"
41 #include "proc_loadavg.h"
42 #include "proc_cpuview.h"
43 #include "utils.h"
44
/*
 * Memory statistics of a cgroup as filled in by the memory.stat parser in
 * this file. Field names follow the keys of the legacy hierarchy's
 * "memory.stat" file; field order is part of the layout and must not change.
 */
struct memory_stat {
	/* Hierarchical limits. */
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;

	/* Hierarchical usage totals. */
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;

	/* Hierarchical paging/fault event totals. */
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;

	/* Hierarchical LRU list totals. */
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
66
67 static off_t get_procfile_size(const char *path)
68 {
69 __do_fclose FILE *f = NULL;
70 __do_free char *line = NULL;
71 size_t len = 0;
72 ssize_t sz, answer = 0;
73
74 f = fopen(path, "re");
75 if (!f)
76 return 0;
77
78 while ((sz = getline(&line, &len, f)) != -1)
79 answer += sz;
80
81 return answer;
82 }
83
84 __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
85 {
86 struct timespec now;
87
88 memset(sb, 0, sizeof(struct stat));
89 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
90 return -EINVAL;
91
92 sb->st_uid = sb->st_gid = 0;
93 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
94 if (strcmp(path, "/proc") == 0) {
95 sb->st_mode = S_IFDIR | 00555;
96 sb->st_nlink = 2;
97 return 0;
98 }
99
100 if (strcmp(path, "/proc/meminfo") == 0 ||
101 strcmp(path, "/proc/cpuinfo") == 0 ||
102 strcmp(path, "/proc/uptime") == 0 ||
103 strcmp(path, "/proc/stat") == 0 ||
104 strcmp(path, "/proc/diskstats") == 0 ||
105 strcmp(path, "/proc/swaps") == 0 ||
106 strcmp(path, "/proc/loadavg") == 0 ||
107 strcmp(path, "/proc/slabinfo") == 0) {
108 sb->st_size = get_procfile_size(path);
109 sb->st_mode = S_IFREG | 00444;
110 sb->st_nlink = 1;
111 return 0;
112 }
113
114 return -ENOENT;
115 }
116
117 __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
118 fuse_fill_dir_t filler, off_t offset,
119 struct fuse_file_info *fi)
120 {
121 if (DIR_FILLER(filler, buf, ".", NULL, 0) != 0 ||
122 DIR_FILLER(filler, buf, "..", NULL, 0) != 0 ||
123 DIR_FILLER(filler, buf, "cpuinfo", NULL, 0) != 0 ||
124 DIR_FILLER(filler, buf, "meminfo", NULL, 0) != 0 ||
125 DIR_FILLER(filler, buf, "stat", NULL, 0) != 0 ||
126 DIR_FILLER(filler, buf, "uptime", NULL, 0) != 0 ||
127 DIR_FILLER(filler, buf, "diskstats", NULL, 0) != 0 ||
128 DIR_FILLER(filler, buf, "swaps", NULL, 0) != 0 ||
129 DIR_FILLER(filler, buf, "loadavg", NULL, 0) != 0 ||
130 DIR_FILLER(filler, buf, "slabinfo", NULL, 0) != 0)
131 return -EINVAL;
132
133 return 0;
134 }
135
136 __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
137 {
138 __do_free struct file_info *info = NULL;
139 int type = -1;
140
141 if (strcmp(path, "/proc/meminfo") == 0)
142 type = LXC_TYPE_PROC_MEMINFO;
143 else if (strcmp(path, "/proc/cpuinfo") == 0)
144 type = LXC_TYPE_PROC_CPUINFO;
145 else if (strcmp(path, "/proc/uptime") == 0)
146 type = LXC_TYPE_PROC_UPTIME;
147 else if (strcmp(path, "/proc/stat") == 0)
148 type = LXC_TYPE_PROC_STAT;
149 else if (strcmp(path, "/proc/diskstats") == 0)
150 type = LXC_TYPE_PROC_DISKSTATS;
151 else if (strcmp(path, "/proc/swaps") == 0)
152 type = LXC_TYPE_PROC_SWAPS;
153 else if (strcmp(path, "/proc/loadavg") == 0)
154 type = LXC_TYPE_PROC_LOADAVG;
155 else if (strcmp(path, "/proc/slabinfo") == 0)
156 type = LXC_TYPE_PROC_SLABINFO;
157 if (type == -1)
158 return -ENOENT;
159
160 info = zalloc(sizeof(*info));
161 if (!info)
162 return -ENOMEM;
163
164 info->type = type;
165
166 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
167
168 info->buf = zalloc(info->buflen);
169 if (!info->buf)
170 return -ENOMEM;
171 /* set actual size to buffer size */
172 info->size = info->buflen;
173
174 fi->fh = PTR_TO_UINT64(move_ptr(info));
175 return 0;
176 }
177
178 __lxcfs_fuse_ops int proc_access(const char *path, int mask)
179 {
180 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
181 return 0;
182
183 /* these are all read-only */
184 if ((mask & ~R_OK) != 0)
185 return -EACCES;
186
187 return 0;
188 }
189
190 __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
191 {
192 do_release_file_info(fi);
193 return 0;
194 }
195
196 static uint64_t get_memlimit(const char *cgroup, bool swap)
197 {
198 __do_free char *memlimit_str = NULL;
199 uint64_t memlimit = 0;
200 int ret;
201
202 if (swap)
203 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
204 else
205 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
206 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
207 lxcfs_error("Failed to convert memlimit %s", memlimit_str);
208
209 return memlimit;
210 }
211
/*
 * GNU flavour of dirname(), modelled on glibc-2.32. Unlike POSIX dirname(),
 * which would turn "/some-dir" into "/some-dir", this returns "/", which is
 * what `get_min_memlimit()` relies on.
 *
 * @path is modified in place: the directory part is terminated with a NUL
 * byte and @path is returned. When @path is NULL or contains no directory
 * part, a pointer to a static, read-only "." is returned instead and @path
 * is left untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *slash, *cursor;

	if (path == NULL)
		return (char *)dot;

	slash = strrchr(path, '/');

	/*
	 * The path ends in '/' (and is more than a lone leading slash): skip
	 * the run of trailing slashes and look for the slash in front of the
	 * last path component instead.
	 */
	if (slash != NULL && slash != path && slash[1] == '\0') {
		cursor = slash;
		while (cursor != path && cursor[-1] == '/')
			cursor--;

		if (cursor != path) {
			/* Backward search for '/' in [path, cursor). */
			slash = NULL;
			while (cursor != path) {
				cursor--;
				if (*cursor == '/') {
					slash = cursor;
					break;
				}
			}
		}
	}

	/*
	 * No directory part at all. XPG requires "." to be returned, hence the
	 * static constant string.
	 */
	if (slash == NULL)
		return (char *)dot;

	/* Collapse a run of slashes in front of the one we found. */
	cursor = slash;
	while (cursor != path && cursor[-1] == '/')
		cursor--;

	if (cursor == path) {
		/*
		 * Only slashes precede the last component, so the result is
		 * "/". Exactly two leading slashes are preserved as "//" (see
		 * XBD 4.10 Path Name Resolution).
		 */
		slash = (slash == path + 1) ? slash + 1 : path + 1;
	} else {
		slash = cursor;
	}

	*slash = '\0';

	return path;
}
273
274 static uint64_t get_min_memlimit(const char *cgroup, bool swap)
275 {
276 __do_free char *copy = NULL;
277 uint64_t memlimit = 0, retlimit = 0;
278
279 copy = strdup(cgroup);
280 if (!copy)
281 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
282
283 retlimit = get_memlimit(copy, swap);
284
285 /*
286 * If the cgroup doesn't start with / (probably won't happen), dirname()
287 * will terminate with "" instead of "/"
288 */
289 while (*copy && strcmp(copy, "/") != 0) {
290 char *it = copy;
291
292 it = gnu_dirname(it);
293 memlimit = get_memlimit(it, swap);
294 if (memlimit > 0 && memlimit < retlimit)
295 retlimit = memlimit;
296 };
297
298 return retlimit;
299 }
300
/* Return true if @line begins with the prefix @pref (an empty prefix matches). */
static inline bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return !strncmp(line, pref, preflen);
}
305
306 static void get_swap_info(const char *cgroup, uint64_t memlimit,
307 uint64_t memusage, uint64_t *swtotal,
308 uint64_t *swusage, uint64_t *memswpriority)
309 {
310 __do_free char *memswusage_str = NULL, *memswpriority_str = NULL;
311 size_t memswlimit = 0, memswusage = 0;
312 int ret;
313
314 *swtotal = *swusage = 0;
315 *memswpriority = 1;
316
317 memswlimit = get_min_memlimit(cgroup, true);
318 if (memswlimit > 0) {
319 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
320 if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
321 return;
322
323 if (liblxcfs_memory_is_cgroupv2()) {
324 *swtotal = memswlimit / 1024;
325 *swusage = memswusage / 1024;
326 } else {
327 if (memlimit > memswlimit)
328 *swtotal = 0;
329 else
330 *swtotal = (memswlimit - memlimit) / 1024;
331 if (memusage > memswusage || swtotal == 0)
332 *swusage = 0;
333 else
334 *swusage = (memswusage - memusage) / 1024;
335 }
336
337 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
338 if (ret >= 0)
339 safe_uint64(memswpriority_str, memswpriority, 10);
340 }
341 }
342
343 static int proc_swaps_read(char *buf, size_t size, off_t offset,
344 struct fuse_file_info *fi)
345 {
346 __do_free char *cgroup = NULL, *memusage_str = NULL,
347 *memswusage_str = NULL, *memswpriority_str = NULL;
348 struct fuse_context *fc = fuse_get_context();
349 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
350 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
351 uint64_t memlimit = 0, memusage = 0,
352 swtotal = 0, swusage = 0, memswpriority = 1,
353 hostswtotal = 0, hostswfree = 0;
354 ssize_t total_len = 0;
355 ssize_t l = 0;
356 char *cache = d->buf;
357 int ret;
358 __do_free char *line = NULL;
359 __do_free void *fopen_cache = NULL;
360 __do_fclose FILE *f = NULL;
361 size_t linelen = 0;
362
363 if (offset) {
364 size_t left;
365
366 if (offset > d->size)
367 return -EINVAL;
368
369 if (!d->cached)
370 return 0;
371
372 left = d->size - offset;
373 total_len = left > size ? size: left;
374 memcpy(buf, cache + offset, total_len);
375
376 return total_len;
377 }
378
379 pid_t initpid = lookup_initpid_in_store(fc->pid);
380 if (initpid <= 1 || is_shared_pidns(initpid))
381 initpid = fc->pid;
382
383 cgroup = get_pid_cgroup(initpid, "memory");
384 if (!cgroup)
385 return read_file_fuse("/proc/swaps", buf, size, d);
386 prune_init_slice(cgroup);
387
388 memlimit = get_min_memlimit(cgroup, false);
389
390 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
391 if (ret < 0)
392 return 0;
393
394 if (safe_uint64(memusage_str, &memusage, 10) < 0)
395 lxcfs_error("Failed to convert memusage %s", memusage_str);
396
397 if (wants_swap)
398 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
399
400 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
401
402 /* Read host total and free values */
403 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
404 if (!f)
405 return 0;
406
407 while (getline(&line, &linelen, f) != -1) {
408 if (startswith(line, "SwapTotal:"))
409 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
410 else if (startswith(line, "SwapFree:"))
411 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
412 }
413
414 if (wants_swap) {
415 /* The total amount of swap is always reported to be the
416 lesser of the RAM+SWAP limit or the SWAP device size.
417 This is because the kernel can swap as much as it
418 wants and not only up to swtotal. */
419 swtotal = memlimit / 1024 + swtotal;
420 if (hostswtotal < swtotal) {
421 swtotal = hostswtotal;
422 }
423
424 /* When swappiness is 0, pretend we can't swap. */
425 if (memswpriority == 0) {
426 swtotal = swusage;
427 }
428 }
429
430 if (swtotal > 0) {
431 l = snprintf(d->buf + total_len, d->size - total_len,
432 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
433 36, " ", swtotal, swusage);
434 total_len += l;
435 }
436
437 if (total_len < 0 || l < 0)
438 return log_error(0, "Failed writing to cache");
439
440 d->cached = 1;
441 d->size = (int)total_len;
442
443 if ((size_t)total_len > size)
444 total_len = size;
445 memcpy(buf, d->buf, total_len);
446
447 return total_len;
448 }
449
/*
 * Look up one counter in the contents of a blkio statistics file.
 *
 * @str is expected to hold lines of the form "<major>:<minor> <iotype> <value>".
 * The first line starting with "@major:@minor @iotype" is used and the number
 * following it is stored in *@v.
 *
 * *@v is set to 0 if @str is NULL, if no line matches, or if no number can be
 * parsed from the matching line.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32];
	size_t len;
	int ret;

	*v = 0;

	/* The caller may not have been able to read the statistics file. */
	if (!str)
		return;

	ret = snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	if (ret < 0 || (size_t)ret >= sizeof(key))
		return; /* a truncated key could match the wrong line */
	len = (size_t)ret;

	while (*str) {
		if (strncmp(str, key, len) == 0) {
			if (sscanf(str + len, "%" SCNu64, v) != 1)
				*v = 0;
			return;
		}

		/* Advance to the start of the next line, if there is one. */
		str = strchr(str, '\n');
		if (!str)
			return;
		str++;
	}
}
473
/*
 * One line of /proc/diskstats. The number in each comment is the position of
 * the field within the line.
 */
struct lxcfs_diskstats {
	/* Device identification. */
	unsigned int major;		/*  1 - major number */
	unsigned int minor;		/*  2 - minor number */
	char dev_name[72];		/*  3 - device name */

	/* Reads. */
	uint64_t read;			/*  4 - reads completed successfully */
	uint64_t read_merged;		/*  5 - reads merged */
	uint64_t read_sectors;		/*  6 - sectors read */
	uint64_t read_ticks;		/*  7 - time spent reading (ms) */

	/* Writes. */
	uint64_t write;			/*  8 - writes completed */
	uint64_t write_merged;		/*  9 - writes merged */
	uint64_t write_sectors;		/* 10 - sectors written */
	uint64_t write_ticks;		/* 11 - time spent writing (ms) */

	/* Overall I/O. */
	uint64_t ios_pgr;		/* 12 - I/Os currently in progress */
	uint64_t total_ticks;		/* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;		/* 14 - weighted time spent doing I/Os (ms) */

	/* Discards (kernel 4.18+). */
	uint64_t discard;		/* 15 - discards completed successfully */
	uint64_t discard_merged;	/* 16 - discards merged */
	uint64_t discard_sectors;	/* 17 - sectors discarded */
	uint64_t discard_ticks;		/* 18 - time spent discarding */
};
494
495 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
496 struct fuse_file_info *fi)
497 {
498 __do_free char *cg = NULL, *io_serviced_str = NULL,
499 *io_merged_str = NULL, *io_service_bytes_str = NULL,
500 *io_wait_time_str = NULL, *io_service_time_str = NULL,
501 *line = NULL;
502 __do_free void *fopen_cache = NULL;
503 __do_fclose FILE *f = NULL;
504 struct fuse_context *fc = fuse_get_context();
505 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
506 struct lxcfs_diskstats stats = {};
507 /* helper fields */
508 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
509 write_wait_time, discard_wait_time;
510 char *cache = d->buf;
511 size_t cache_size = d->buflen;
512 size_t linelen = 0, total_len = 0;
513 int i = 0;
514 int ret;
515
516 if (offset) {
517 size_t left;
518
519 if (offset > d->size)
520 return -EINVAL;
521
522 if (!d->cached)
523 return 0;
524
525 left = d->size - offset;
526 total_len = left > size ? size: left;
527 memcpy(buf, cache + offset, total_len);
528
529 return total_len;
530 }
531
532 pid_t initpid = lookup_initpid_in_store(fc->pid);
533 if (initpid <= 1 || is_shared_pidns(initpid))
534 initpid = fc->pid;
535
536 cg = get_pid_cgroup(initpid, "blkio");
537 if (!cg)
538 return read_file_fuse("/proc/diskstats", buf, size, d);
539 prune_init_slice(cg);
540
541 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
542 if (ret < 0) {
543 if (ret == -EOPNOTSUPP)
544 return read_file_fuse("/proc/diskstats", buf, size, d);
545 }
546
547 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
548 if (ret < 0) {
549 if (ret == -EOPNOTSUPP)
550 return read_file_fuse("/proc/diskstats", buf, size, d);
551 }
552
553 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
554 if (ret < 0) {
555 if (ret == -EOPNOTSUPP)
556 return read_file_fuse("/proc/diskstats", buf, size, d);
557 }
558
559 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
560 if (ret < 0) {
561 if (ret == -EOPNOTSUPP)
562 return read_file_fuse("/proc/diskstats", buf, size, d);
563 }
564
565 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
566 if (ret < 0) {
567 if (ret == -EOPNOTSUPP)
568 return read_file_fuse("/proc/diskstats", buf, size, d);
569 }
570
571 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
572 if (!f)
573 return 0;
574
575 while (getline(&line, &linelen, f) != -1) {
576 ssize_t l;
577 char lbuf[256];
578
579 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
580 if (i != 3)
581 continue;
582
583 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
584 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
585 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
586
587 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
588 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
589 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
590
591 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
592 stats.read_sectors = stats.read_sectors / 512;
593 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
594 stats.write_sectors = stats.write_sectors / 512;
595 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
596 stats.discard_sectors = stats.discard_sectors / 512;
597
598 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
599 read_service_time = read_service_time / 1000000;
600 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
601 read_wait_time = read_wait_time / 1000000;
602 stats.read_ticks = read_service_time + read_wait_time;
603
604 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
605 write_service_time = write_service_time / 1000000;
606 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
607 write_wait_time = write_wait_time / 1000000;
608 stats.write_ticks = write_service_time + write_wait_time;
609
610 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
611 discard_service_time = discard_service_time / 1000000;
612 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
613 discard_wait_time = discard_wait_time / 1000000;
614 stats.discard_ticks = discard_service_time + discard_wait_time;
615
616 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
617 stats.total_ticks = stats.total_ticks / 1000000;
618
619 memset(lbuf, 0, 256);
620 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
621 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
622 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks ||
623 stats.discard_merged || stats.discard_sectors || stats.discard_ticks)
624 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
625 stats.major,
626 stats.minor,
627 stats.dev_name,
628 stats.read,
629 stats.read_merged,
630 stats.read_sectors,
631 stats.read_ticks,
632 stats.write,
633 stats.write_merged,
634 stats.write_sectors,
635 stats.write_ticks,
636 stats.ios_pgr,
637 stats.total_ticks,
638 stats.rq_ticks,
639 stats.discard_merged,
640 stats.discard_sectors,
641 stats.discard_ticks);
642 else
643 continue;
644
645 l = snprintf(cache, cache_size, "%s", lbuf);
646 if (l < 0)
647 return log_error(0, "Failed to write cache");
648 if ((size_t)l >= cache_size)
649 return log_error(0, "Write to cache was truncated");
650
651 cache += l;
652 cache_size -= l;
653 total_len += l;
654 }
655
656 d->cached = 1;
657 d->size = total_len;
658 if (total_len > size)
659 total_len = size;
660 memcpy(buf, d->buf, total_len);
661
662 return total_len;
663 }
664
#ifdef RELOADTEST
/*
 * Test hook: leave the marker file /tmp/lxcfs-iwashere behind. Failure to
 * create it (e.g. because it already exists) is deliberately ignored.
 */
static inline void iwashere(void)
{
	/*
	 * The permission bits belong into the mode argument; the device
	 * argument is meaningless for a regular file.
	 */
	(void)mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
671
672 /*
673 * This function retrieves the busy time of a group of tasks by looking at
674 * cpuacct.usage. Unfortunately, this only makes sense when the container has
675 * been given it's own cpuacct cgroup. If not, this function will take the busy
676 * time of all other taks that do not actually belong to the container into
677 * account as well. If someone has a clever solution for this please send a
678 * patch!
679 */
680 static double get_reaper_busy(pid_t task)
681 {
682 __do_free char *cgroup = NULL, *usage_str = NULL;
683 uint64_t usage = 0;
684 pid_t initpid;
685
686 initpid = lookup_initpid_in_store(task);
687 if (initpid <= 0)
688 return 0;
689
690 cgroup = get_pid_cgroup(initpid, "cpuacct");
691 if (!cgroup)
692 return 0;
693 prune_init_slice(cgroup);
694
695 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
696 return 0;
697
698 if (safe_uint64(usage_str, &usage, 10) < 0)
699 lxcfs_error("Failed to convert usage %s", usage_str);
700
701 return ((double)usage / 1000000000);
702 }
703
704 static uint64_t get_reaper_start_time(pid_t pid)
705 {
706 __do_free void *fopen_cache = NULL;
707 __do_fclose FILE *f = NULL;
708 int ret;
709 uint64_t starttime;
710 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
711 STRLITERALLEN("/stat") + 1];
712 pid_t qpid;
713
714 qpid = lookup_initpid_in_store(pid);
715 if (qpid <= 0)
716 return ret_errno(EINVAL);
717
718 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
719 if (ret < 0 || (size_t)ret >= sizeof(path))
720 return ret_errno(EINVAL);
721
722 f = fopen_cached(path, "re", &fopen_cache);
723 if (!f)
724 return ret_errno(EINVAL);
725
726 /* Note that the *scanf() argument supression requires that length
727 * modifiers such as "l" are omitted. Otherwise some compilers will yell
728 * at us. It's like telling someone you're not married and then asking
729 * if you can bring your wife to the party.
730 */
731 ret = fscanf(f, "%*d " /* (1) pid %d */
732 "%*s " /* (2) comm %s */
733 "%*c " /* (3) state %c */
734 "%*d " /* (4) ppid %d */
735 "%*d " /* (5) pgrp %d */
736 "%*d " /* (6) session %d */
737 "%*d " /* (7) tty_nr %d */
738 "%*d " /* (8) tpgid %d */
739 "%*u " /* (9) flags %u */
740 "%*u " /* (10) minflt %lu */
741 "%*u " /* (11) cminflt %lu */
742 "%*u " /* (12) majflt %lu */
743 "%*u " /* (13) cmajflt %lu */
744 "%*u " /* (14) utime %lu */
745 "%*u " /* (15) stime %lu */
746 "%*d " /* (16) cutime %ld */
747 "%*d " /* (17) cstime %ld */
748 "%*d " /* (18) priority %ld */
749 "%*d " /* (19) nice %ld */
750 "%*d " /* (20) num_threads %ld */
751 "%*d " /* (21) itrealvalue %ld */
752 "%" PRIu64, /* (22) starttime %llu */
753 &starttime);
754 if (ret != 1)
755 return ret_errno(EINVAL);
756
757 return ret_set_errno(starttime, 0);
758 }
759
/*
 * Return the start time of the init process looked up for @pid, converted
 * from clock ticks to seconds using sysconf(_SC_CLK_TCK).
 *
 * Returns 0 if the start time or the tick rate cannot be determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ret;

	clockticks = get_reaper_start_time(pid);
	/*
	 * clockticks is unsigned: a negative error code returned through it
	 * would show up as a value above INT64_MAX, which no real start time
	 * can reach. Treat that like the plain 0 failure value.
	 */
	if (clockticks == 0 || clockticks > INT64_MAX)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	/* A tick rate of 0 is as unusable as an error: we divide by it. */
	ret = sysconf(_SC_CLK_TCK);
	if (ret <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (uint64_t)ret;
}
778
779 static double get_reaper_age(pid_t pid)
780 {
781 uint64_t uptime_ms;
782 double procstart, procage;
783
784 /*
785 * We need to substract the time the process has started since system
786 * boot minus the time when the system has started to get the actual
787 * reaper age.
788 */
789 procstart = get_reaper_start_time_in_sec(pid);
790 procage = procstart;
791 if (procstart > 0) {
792 int ret;
793 struct timespec spec;
794
795 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
796 if (ret < 0)
797 return 0;
798
799 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
800 procage = (uptime_ms - (procstart * 1000)) / 1000;
801 }
802
803 return procage;
804 }
805
806 /*
807 * We read /proc/uptime and reuse its second field.
808 * For the first field, we use the mtime for the reaper for
809 * the calling pid as returned by getreaperage
810 */
811 static int proc_uptime_read(char *buf, size_t size, off_t offset,
812 struct fuse_file_info *fi)
813 {
814 struct fuse_context *fc = fuse_get_context();
815 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
816 char *cache = d->buf;
817 ssize_t total_len = 0, ret = 0;
818 double busytime, idletime, reaperage;
819
820 #ifdef RELOADTEST
821 iwashere();
822 #endif
823
824 if (offset) {
825 size_t left;
826
827 if (offset > d->size)
828 return -EINVAL;
829
830 if (!d->cached)
831 return 0;
832
833 left = d->size - offset;
834 total_len = left > size ? size : left;
835 memcpy(buf, cache + offset, total_len);
836
837 return total_len;
838 }
839
840 reaperage = get_reaper_age(fc->pid);
841 /*
842 * To understand why this is done, please read the comment to the
843 * get_reaper_busy() function.
844 */
845 idletime = reaperage;
846 busytime = get_reaper_busy(fc->pid);
847 if (reaperage >= busytime)
848 idletime = reaperage - busytime;
849
850 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
851 if (ret < 0 || ret >= d->buflen)
852 return read_file_fuse("/proc/uptime", buf, size, d);
853 total_len = ret;
854
855 d->cached = 1;
856 d->size = total_len;
857 if ((size_t)total_len > size)
858 total_len = size;
859 memcpy(buf, d->buf, total_len);
860
861 return total_len;
862 }
863
864 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
865 static int proc_stat_read(char *buf, size_t size, off_t offset,
866 struct fuse_file_info *fi)
867 {
868 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
869 __do_free void *fopen_cache = NULL;
870 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
871 __do_fclose FILE *f = NULL;
872 struct fuse_context *fc = fuse_get_context();
873 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
874 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
875 size_t linelen = 0, total_len = 0;
876 int curcpu = -1; /* cpu numbering starts at 0 */
877 int physcpu = 0;
878 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
879 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
880 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
881 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
882 guest_sum = 0, guest_nice_sum = 0;
883 char cpuall[CPUALL_MAX_SIZE];
884 /* reserve for cpu all */
885 char *cache = d->buf + CPUALL_MAX_SIZE;
886 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
887 int cg_cpu_usage_size = 0;
888
889 if (offset) {
890 size_t left;
891
892 if (offset > d->size)
893 return -EINVAL;
894
895 if (!d->cached)
896 return 0;
897
898 left = d->size - offset;
899 total_len = left > size ? size : left;
900 memcpy(buf, d->buf + offset, total_len);
901
902 return total_len;
903 }
904
905 pid_t initpid = lookup_initpid_in_store(fc->pid);
906 if (initpid <= 1 || is_shared_pidns(initpid))
907 initpid = fc->pid;
908
909 /*
910 * when container run with host pid namespace initpid == 1, cgroup will "/"
911 * we should return host os's /proc contents.
912 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
913 */
914 if (initpid == 1)
915 return read_file_fuse("/proc/stat", buf, size, d);
916
917 cg = get_pid_cgroup(initpid, "cpuset");
918 if (!cg)
919 return read_file_fuse("/proc/stat", buf, size, d);
920 prune_init_slice(cg);
921
922 cpuset = get_cpuset(cg);
923 if (!cpuset)
924 return 0;
925
926 f = fopen_cached("/proc/stat", "re", &fopen_cache);
927 if (!f)
928 return 0;
929
930 /* Skip first system cpu line. */
931 if (getline(&line, &linelen, f) < 0)
932 return log_error(0, "proc_stat_read read first line failed");
933
934 /*
935 * Read cpuacct.usage_all for all CPUs.
936 * If the cpuacct cgroup is present, it is used to calculate the container's
937 * CPU usage. If not, values from the host's /proc/stat are used.
938 */
939 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
940 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
941 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage,
942 cg_cpu_usage_size, f,
943 d->buf, d->buflen);
944 goto out;
945 }
946 } else {
947 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
948 }
949
950 while (getline(&line, &linelen, f) != -1) {
951 ssize_t l;
952 char cpu_char[10]; /* That's a lot of cores */
953 char *c;
954 uint64_t all_used, cg_used, new_idle;
955 int ret;
956
957 if (strlen(line) == 0)
958 continue;
959 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
960 /* not a ^cpuN line containing a number N, just print it */
961 l = snprintf(cache, cache_size, "%s", line);
962 if (l < 0)
963 return log_error(0, "Failed to write cache");
964 if ((size_t)l >= cache_size)
965 return log_error(0, "Write to cache was truncated");
966
967 cache += l;
968 cache_size -= l;
969 total_len += l;
970
971 continue;
972 }
973
974 if (sscanf(cpu_char, "%d", &physcpu) != 1)
975 continue;
976
977 if (!cpu_in_cpuset(physcpu, cpuset))
978 continue;
979
980 curcpu++;
981
982 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
983 &user,
984 &nice,
985 &system,
986 &idle,
987 &iowait,
988 &irq,
989 &softirq,
990 &steal,
991 &guest,
992 &guest_nice);
993 if (ret != 10 || !cg_cpu_usage) {
994 c = strchr(line, ' ');
995 if (!c)
996 continue;
997
998 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
999 if (l < 0)
1000 return log_error(0, "Failed to write cache");
1001 if ((size_t)l >= cache_size)
1002 return log_error(0, "Write to cache was truncated");
1003
1004 cache += l;
1005 cache_size -= l;
1006 total_len += l;
1007
1008 if (ret != 10)
1009 continue;
1010 }
1011
1012 if (cg_cpu_usage) {
1013 if (physcpu >= cg_cpu_usage_size)
1014 break;
1015
1016 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1017 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1018
1019 if (all_used >= cg_used) {
1020 new_idle = idle + (all_used - cg_used);
1021 } else {
1022 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
1023 curcpu, cg, all_used, cg_used);
1024 new_idle = idle;
1025 }
1026
1027 l = snprintf(cache, cache_size,
1028 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1029 curcpu, cg_cpu_usage[physcpu].user,
1030 cg_cpu_usage[physcpu].system, new_idle);
1031 if (l < 0)
1032 return log_error(0, "Failed to write cache");
1033 if ((size_t)l >= cache_size)
1034 return log_error(0, "Write to cache was truncated");
1035
1036 cache += l;
1037 cache_size -= l;
1038 total_len += l;
1039
1040 user_sum += cg_cpu_usage[physcpu].user;
1041 system_sum += cg_cpu_usage[physcpu].system;
1042 idle_sum += new_idle;
1043 } else {
1044 user_sum += user;
1045 nice_sum += nice;
1046 system_sum += system;
1047 idle_sum += idle;
1048 iowait_sum += iowait;
1049 irq_sum += irq;
1050 softirq_sum += softirq;
1051 steal_sum += steal;
1052 guest_sum += guest;
1053 guest_nice_sum += guest_nice;
1054 }
1055 }
1056
1057 cache = d->buf;
1058
1059 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1060 user_sum,
1061 nice_sum,
1062 system_sum,
1063 idle_sum,
1064 iowait_sum,
1065 irq_sum,
1066 softirq_sum,
1067 steal_sum,
1068 guest_sum,
1069 guest_nice_sum);
1070 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1071 memcpy(cache, cpuall, cpuall_len);
1072 cache += cpuall_len;
1073 } else {
1074 /* shouldn't happen */
1075 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1076 cpuall_len = 0;
1077 }
1078
1079 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1080 total_len += cpuall_len;
1081
1082 out:
1083 d->cached = 1;
1084 d->size = total_len;
1085 if (total_len > size)
1086 total_len = size;
1087
1088 memcpy(buf, d->buf, total_len);
1089 return total_len;
1090 }
1091
1092 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1093 static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1094 {
1095 __do_close int fd = -EBADF;
1096 __do_fclose FILE *f = NULL;
1097 __do_free char *line = NULL;
1098 __do_free void *fdopen_cache = NULL;
1099 bool unified;
1100 size_t len = 0;
1101 ssize_t linelen;
1102
1103 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1104 if (fd < 0)
1105 return false;
1106
1107 f = fdopen_cached(fd, "re", &fdopen_cache);
1108 if (!f)
1109 return false;
1110
1111 unified = pure_unified_layout(cgroup_ops);
1112 while ((linelen = getline(&line, &len, f)) != -1) {
1113 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1114 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1115 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1116 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
1117 } else if (startswith(line, unified ? "file" :"total_cache")) {
1118 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
1119 } else if (!unified && startswith(line, "total_rss")) {
1120 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1121 } else if (!unified && startswith(line, "total_rss_huge")) {
1122 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1123 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1124 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1125 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1126 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1127 } else if (!unified && startswith(line, "total_dirty")) {
1128 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1129 } else if (!unified && startswith(line, "total_writeback")) {
1130 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1131 } else if (!unified && startswith(line, "total_swap")) {
1132 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1133 } else if (!unified && startswith(line, "total_pgpgin")) {
1134 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1135 } else if (!unified && startswith(line, "total_pgpgout")) {
1136 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1137 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1138 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1139 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1140 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1141 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1142 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1143 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1144 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1145 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1146 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1147 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1148 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1149 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1150 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1151 }
1152 }
1153
1154 return true;
1155 }
1156
1157 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1158 struct fuse_file_info *fi)
1159 {
1160 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
1161 *memswusage_str = NULL, *memswpriority_str = NULL;
1162 __do_free void *fopen_cache = NULL;
1163 __do_fclose FILE *f = NULL;
1164 struct fuse_context *fc = fuse_get_context();
1165 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
1166 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1167 uint64_t memlimit = 0, memusage = 0,
1168 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1169 memswpriority = 1;
1170 struct memory_stat mstat = {};
1171 size_t linelen = 0, total_len = 0;
1172 char *cache = d->buf;
1173 size_t cache_size = d->buflen;
1174 int ret;
1175
1176 if (offset) {
1177 size_t left;
1178
1179 if (offset > d->size)
1180 return -EINVAL;
1181
1182 if (!d->cached)
1183 return 0;
1184
1185 left = d->size - offset;
1186 total_len = left > size ? size : left;
1187 memcpy(buf, cache + offset, total_len);
1188
1189 return total_len;
1190 }
1191
1192 pid_t initpid = lookup_initpid_in_store(fc->pid);
1193 if (initpid <= 1 || is_shared_pidns(initpid))
1194 initpid = fc->pid;
1195
1196 cgroup = get_pid_cgroup(initpid, "memory");
1197 if (!cgroup)
1198 return read_file_fuse("/proc/meminfo", buf, size, d);
1199
1200 prune_init_slice(cgroup);
1201
1202 /* memory limits */
1203 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1204 if (ret < 0)
1205 return read_file_fuse("/proc/meminfo", buf, size, d);
1206
1207 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1208 lxcfs_error("Failed to convert memusage %s", memusage_str);
1209
1210 if (!cgroup_parse_memory_stat(cgroup, &mstat))
1211 return read_file_fuse("/proc/meminfo", buf, size, d);
1212
1213 memlimit = get_min_memlimit(cgroup, false);
1214
1215 /*
1216 * Following values are allowed to fail, because swapaccount might be
1217 * turned off for current kernel.
1218 */
1219 if (wants_swap)
1220 get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority);
1221
1222 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1223 if (!f)
1224 return read_file_fuse("/proc/meminfo", buf, size, d);
1225
1226 memusage /= 1024;
1227 memlimit /= 1024;
1228 while (getline(&line, &linelen, f) != -1) {
1229 ssize_t l;
1230 char *printme, lbuf[100];
1231
1232 memset(lbuf, 0, 100);
1233 if (startswith(line, "MemTotal:")) {
1234 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
1235 if (memlimit == 0)
1236 memlimit = hosttotal;
1237
1238 if (hosttotal < memlimit)
1239 memlimit = hosttotal;
1240 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1241 printme = lbuf;
1242 } else if (startswith(line, "MemFree:")) {
1243 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1244 printme = lbuf;
1245 } else if (startswith(line, "MemAvailable:")) {
1246 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024);
1247 printme = lbuf;
1248 } else if (startswith(line, "SwapTotal:")) {
1249 if (wants_swap) {
1250 uint64_t hostswtotal = 0;
1251
1252 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1253
1254 /* The total amount of swap is always reported to be the
1255 lesser of the RAM+SWAP limit or the SWAP device size.
1256 This is because the kernel can swap as much as it
1257 wants and not only up to swtotal. */
1258
1259 if (!liblxcfs_memory_is_cgroupv2())
1260 swtotal += memlimit;
1261
1262 if (hostswtotal < swtotal) {
1263 swtotal = hostswtotal;
1264 }
1265
1266 /* When swappiness is 0, pretend we can't swap. */
1267 if (memswpriority == 0) {
1268 swtotal = swusage;
1269 }
1270 }
1271
1272 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1273 printme = lbuf;
1274 } else if (startswith(line, "SwapFree:")) {
1275 if (wants_swap) {
1276 swfree = swtotal - swusage;
1277 }
1278
1279 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1280 printme = lbuf;
1281 } else if (startswith(line, "Slab:")) {
1282 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1283 printme = lbuf;
1284 } else if (startswith(line, "Buffers:")) {
1285 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1286 printme = lbuf;
1287 } else if (startswith(line, "Cached:")) {
1288 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1289 mstat.total_cache / 1024);
1290 printme = lbuf;
1291 } else if (startswith(line, "SwapCached:")) {
1292 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1293 printme = lbuf;
1294 } else if (startswith(line, "Active:")) {
1295 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1296 (mstat.total_active_anon +
1297 mstat.total_active_file) /
1298 1024);
1299 printme = lbuf;
1300 } else if (startswith(line, "Inactive:")) {
1301 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1302 (mstat.total_inactive_anon +
1303 mstat.total_inactive_file) /
1304 1024);
1305 printme = lbuf;
1306 } else if (startswith(line, "Active(anon):")) {
1307 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1308 mstat.total_active_anon / 1024);
1309 printme = lbuf;
1310 } else if (startswith(line, "Inactive(anon):")) {
1311 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1312 mstat.total_inactive_anon / 1024);
1313 printme = lbuf;
1314 } else if (startswith(line, "Active(file):")) {
1315 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1316 mstat.total_active_file / 1024);
1317 printme = lbuf;
1318 } else if (startswith(line, "Inactive(file):")) {
1319 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1320 mstat.total_inactive_file / 1024);
1321 printme = lbuf;
1322 } else if (startswith(line, "Unevictable:")) {
1323 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1324 mstat.total_unevictable / 1024);
1325 printme = lbuf;
1326 } else if (startswith(line, "Dirty:")) {
1327 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1328 mstat.total_dirty / 1024);
1329 printme = lbuf;
1330 } else if (startswith(line, "Writeback:")) {
1331 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1332 mstat.total_writeback / 1024);
1333 printme = lbuf;
1334 } else if (startswith(line, "AnonPages:")) {
1335 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1336 (mstat.total_active_anon +
1337 mstat.total_inactive_anon - mstat.total_shmem) /
1338 1024);
1339 printme = lbuf;
1340 } else if (startswith(line, "Mapped:")) {
1341 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1342 mstat.total_mapped_file / 1024);
1343 printme = lbuf;
1344 } else if (startswith(line, "SReclaimable:")) {
1345 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1346 printme = lbuf;
1347 } else if (startswith(line, "SUnreclaim:")) {
1348 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1349 printme = lbuf;
1350 } else if (startswith(line, "Shmem:")) {
1351 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1352 mstat.total_shmem / 1024);
1353 printme = lbuf;
1354 } else if (startswith(line, "ShmemHugePages:")) {
1355 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1356 printme = lbuf;
1357 } else if (startswith(line, "ShmemPmdMapped:")) {
1358 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1359 printme = lbuf;
1360 } else if (startswith(line, "AnonHugePages:")) {
1361 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1362 mstat.total_rss_huge / 1024);
1363 printme = lbuf;
1364 } else {
1365 printme = line;
1366 }
1367
1368 l = snprintf(cache, cache_size, "%s", printme);
1369 if (l < 0)
1370 return log_error(0, "Failed to write cache");
1371 if ((size_t)l >= cache_size)
1372 return log_error(0, "Write to cache was truncated");
1373
1374 cache += l;
1375 cache_size -= l;
1376 total_len += l;
1377 }
1378
1379 d->cached = 1;
1380 d->size = total_len;
1381 if (total_len > size)
1382 total_len = size;
1383 memcpy(buf, d->buf, total_len);
1384
1385 return total_len;
1386 }
1387
1388 static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1389 struct fuse_file_info *fi)
1390 {
1391 __do_free char *cgroup = NULL, *line = NULL;
1392 __do_free void *fopen_cache = NULL;
1393 __do_fclose FILE *f = NULL;
1394 __do_close int fd = -EBADF;
1395 struct fuse_context *fc = fuse_get_context();
1396 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1397 size_t linelen = 0, total_len = 0;
1398 char *cache = d->buf;
1399 size_t cache_size = d->buflen;
1400 pid_t initpid;
1401
1402 if (offset) {
1403 size_t left;
1404
1405 if (offset > d->size)
1406 return -EINVAL;
1407
1408 if (!d->cached)
1409 return 0;
1410
1411 left = d->size - offset;
1412 total_len = left > size ? size : left;
1413 memcpy(buf, cache + offset, total_len);
1414
1415 return total_len;
1416 }
1417
1418 initpid = lookup_initpid_in_store(fc->pid);
1419 if (initpid <= 1 || is_shared_pidns(initpid))
1420 initpid = fc->pid;
1421
1422 cgroup = get_pid_cgroup(initpid, "memory");
1423 if (!cgroup)
1424 return read_file_fuse("/proc/slabinfo", buf, size, d);
1425
1426 prune_init_slice(cgroup);
1427
1428 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1429 if (fd < 0)
1430 return read_file_fuse("/proc/slabinfo", buf, size, d);
1431
1432 f = fdopen_cached(fd, "re", &fopen_cache);
1433 if (!f)
1434 return read_file_fuse("/proc/slabinfo", buf, size, d);
1435
1436 while (getline(&line, &linelen, f) != -1) {
1437 ssize_t l = snprintf(cache, cache_size, "%s", line);
1438 if (l < 0)
1439 return log_error(0, "Failed to write cache");
1440 if ((size_t)l >= cache_size)
1441 return log_error(0, "Write to cache was truncated");
1442
1443 cache += l;
1444 cache_size -= l;
1445 total_len += l;
1446 }
1447
1448 d->cached = 1;
1449 d->size = total_len;
1450 if (total_len > size)
1451 total_len = size;
1452 memcpy(buf, d->buf, total_len);
1453
1454 return total_len;
1455 }
1456
1457 __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1458 off_t offset, struct fuse_file_info *fi)
1459 {
1460 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1461
1462 switch (f->type) {
1463 case LXC_TYPE_PROC_MEMINFO:
1464 if (liblxcfs_functional())
1465 return proc_meminfo_read(buf, size, offset, fi);
1466
1467 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1468 buf, size, offset, f);
1469 case LXC_TYPE_PROC_CPUINFO:
1470 if (liblxcfs_functional())
1471 return proc_cpuinfo_read(buf, size, offset, fi);
1472
1473 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1474 buf, size, offset, f);
1475 case LXC_TYPE_PROC_UPTIME:
1476 if (liblxcfs_functional())
1477 return proc_uptime_read(buf, size, offset, fi);
1478
1479 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1480 buf, size, offset, f);
1481 case LXC_TYPE_PROC_STAT:
1482 if (liblxcfs_functional())
1483 return proc_stat_read(buf, size, offset, fi);
1484
1485 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1486 size, offset, f);
1487 case LXC_TYPE_PROC_DISKSTATS:
1488 if (liblxcfs_functional())
1489 return proc_diskstats_read(buf, size, offset, fi);
1490
1491 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1492 buf, size, offset, f);
1493 case LXC_TYPE_PROC_SWAPS:
1494 if (liblxcfs_functional())
1495 return proc_swaps_read(buf, size, offset, fi);
1496
1497 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1498 size, offset, f);
1499 case LXC_TYPE_PROC_LOADAVG:
1500 if (liblxcfs_functional())
1501 return proc_loadavg_read(buf, size, offset, fi);
1502
1503 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1504 buf, size, offset, f);
1505 case LXC_TYPE_PROC_SLABINFO:
1506 if (liblxcfs_functional())
1507 return proc_slabinfo_read(buf, size, offset, fi);
1508
1509 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1510 buf, size, offset, f);
1511 }
1512
1513 return -EINVAL;
1514 }