]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_fuse.c
tree-wide: include own header file first
[mirror_lxcfs.git] / src / proc_fuse.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #include "config.h"
8
9 #define __STDC_FORMAT_MACROS
10 #include <dirent.h>
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <fuse.h>
14 #include <inttypes.h>
15 #include <libgen.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdarg.h>
19 #include <stdbool.h>
20 #include <stdint.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <time.h>
25 #include <unistd.h>
26 #include <wait.h>
27 #include <linux/magic.h>
28 #include <linux/sched.h>
29 #include <sys/epoll.h>
30 #include <sys/mman.h>
31 #include <sys/mount.h>
32 #include <sys/param.h>
33 #include <sys/socket.h>
34 #include <sys/syscall.h>
35 #include <sys/sysinfo.h>
36 #include <sys/vfs.h>
37
38 #include "proc_fuse.h"
39
40 #include "bindings.h"
41 #include "cgroup_fuse.h"
42 #include "cgroups/cgroup.h"
43 #include "cgroups/cgroup_utils.h"
44 #include "cpuset_parse.h"
45 #include "lxcfs_fuse_compat.h"
46 #include "memory_utils.h"
47 #include "proc_loadavg.h"
48 #include "proc_cpuview.h"
49 #include "utils.h"
50
/*
 * Memory statistics of a cgroup as parsed by cgroup_parse_memory_stat().
 *
 * Every member mirrors one counter of the cgroup's memory statistics file.
 * Members for which no matching line is found keep whatever value the caller
 * initialized them with.
 */
struct memory_stat {
	/* limits */
	uint64_t hierarchical_memory_limit;
	uint64_t hierarchical_memsw_limit;

	/* usage breakdown */
	uint64_t total_cache;
	uint64_t total_rss;
	uint64_t total_rss_huge;
	uint64_t total_shmem;
	uint64_t total_mapped_file;
	uint64_t total_dirty;
	uint64_t total_writeback;
	uint64_t total_swap;

	/* paging activity */
	uint64_t total_pgpgin;
	uint64_t total_pgpgout;
	uint64_t total_pgfault;
	uint64_t total_pgmajfault;

	/* LRU lists */
	uint64_t total_inactive_anon;
	uint64_t total_active_anon;
	uint64_t total_inactive_file;
	uint64_t total_active_file;
	uint64_t total_unevictable;
};
72
73 static off_t get_procfile_size(const char *path)
74 {
75 __do_fclose FILE *f = NULL;
76 __do_free char *line = NULL;
77 size_t len = 0;
78 ssize_t sz, answer = 0;
79
80 f = fopen(path, "re");
81 if (!f)
82 return 0;
83
84 while ((sz = getline(&line, &len, f)) != -1)
85 answer += sz;
86
87 return answer;
88 }
89
90 __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb)
91 {
92 struct timespec now;
93
94 memset(sb, 0, sizeof(struct stat));
95 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
96 return -EINVAL;
97
98 sb->st_uid = sb->st_gid = 0;
99 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
100 if (strcmp(path, "/proc") == 0) {
101 sb->st_mode = S_IFDIR | 00555;
102 sb->st_nlink = 2;
103 return 0;
104 }
105
106 if (strcmp(path, "/proc/meminfo") == 0 ||
107 strcmp(path, "/proc/cpuinfo") == 0 ||
108 strcmp(path, "/proc/uptime") == 0 ||
109 strcmp(path, "/proc/stat") == 0 ||
110 strcmp(path, "/proc/diskstats") == 0 ||
111 strcmp(path, "/proc/swaps") == 0 ||
112 strcmp(path, "/proc/loadavg") == 0 ||
113 strcmp(path, "/proc/slabinfo") == 0) {
114 sb->st_size = get_procfile_size(path);
115 sb->st_mode = S_IFREG | 00444;
116 sb->st_nlink = 1;
117 return 0;
118 }
119
120 return -ENOENT;
121 }
122
123 __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf,
124 fuse_fill_dir_t filler, off_t offset,
125 struct fuse_file_info *fi)
126 {
127 if (DIR_FILLER(filler, buf, ".", NULL, 0) != 0 ||
128 DIR_FILLER(filler, buf, "..", NULL, 0) != 0 ||
129 DIR_FILLER(filler, buf, "cpuinfo", NULL, 0) != 0 ||
130 DIR_FILLER(filler, buf, "meminfo", NULL, 0) != 0 ||
131 DIR_FILLER(filler, buf, "stat", NULL, 0) != 0 ||
132 DIR_FILLER(filler, buf, "uptime", NULL, 0) != 0 ||
133 DIR_FILLER(filler, buf, "diskstats", NULL, 0) != 0 ||
134 DIR_FILLER(filler, buf, "swaps", NULL, 0) != 0 ||
135 DIR_FILLER(filler, buf, "loadavg", NULL, 0) != 0 ||
136 DIR_FILLER(filler, buf, "slabinfo", NULL, 0) != 0)
137 return -EINVAL;
138
139 return 0;
140 }
141
142 __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi)
143 {
144 __do_free struct file_info *info = NULL;
145 int type = -1;
146
147 if (strcmp(path, "/proc/meminfo") == 0)
148 type = LXC_TYPE_PROC_MEMINFO;
149 else if (strcmp(path, "/proc/cpuinfo") == 0)
150 type = LXC_TYPE_PROC_CPUINFO;
151 else if (strcmp(path, "/proc/uptime") == 0)
152 type = LXC_TYPE_PROC_UPTIME;
153 else if (strcmp(path, "/proc/stat") == 0)
154 type = LXC_TYPE_PROC_STAT;
155 else if (strcmp(path, "/proc/diskstats") == 0)
156 type = LXC_TYPE_PROC_DISKSTATS;
157 else if (strcmp(path, "/proc/swaps") == 0)
158 type = LXC_TYPE_PROC_SWAPS;
159 else if (strcmp(path, "/proc/loadavg") == 0)
160 type = LXC_TYPE_PROC_LOADAVG;
161 else if (strcmp(path, "/proc/slabinfo") == 0)
162 type = LXC_TYPE_PROC_SLABINFO;
163 if (type == -1)
164 return -ENOENT;
165
166 info = zalloc(sizeof(*info));
167 if (!info)
168 return -ENOMEM;
169
170 info->type = type;
171
172 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
173
174 info->buf = zalloc(info->buflen);
175 if (!info->buf)
176 return -ENOMEM;
177 /* set actual size to buffer size */
178 info->size = info->buflen;
179
180 fi->fh = PTR_TO_UINT64(move_ptr(info));
181 return 0;
182 }
183
184 __lxcfs_fuse_ops int proc_access(const char *path, int mask)
185 {
186 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
187 return 0;
188
189 /* these are all read-only */
190 if ((mask & ~R_OK) != 0)
191 return -EACCES;
192
193 return 0;
194 }
195
196 __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
197 {
198 do_release_file_info(fi);
199 return 0;
200 }
201
202 static uint64_t get_memlimit(const char *cgroup, bool swap)
203 {
204 __do_free char *memlimit_str = NULL;
205 uint64_t memlimit = 0;
206 int ret;
207
208 if (swap)
209 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
210 else
211 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
212 if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
213 lxcfs_error("Failed to convert memlimit %s", memlimit_str);
214
215 return memlimit;
216 }
217
/*
 * Step @pos backwards over a run of '/' characters. Stops at @start or right
 * behind the first character that is not a slash.
 */
static char *rewind_slashes(char *start, char *pos)
{
	while (pos != start && pos[-1] == '/')
		pos--;

	return pos;
}

/*
 * dirname() with glibc-2.32 semantics: POSIX dirname("/some-dir") will return
 * "/some-dir" as opposed to "/", which breaks `get_min_memlimit()`.
 *
 * @path is modified in place and returned, except when it is NULL or has no
 * directory part: then a pointer to a static, constant "." is returned and
 * @path is left untouched.
 */
static char *gnu_dirname(char *path)
{
	static const char dot[] = ".";
	char *sep, *edge;

	if (path == NULL)
		return (char *)dot;

	sep = strrchr(path, '/');

	/* A trailing slash is not a separator; look for the one before it. */
	if (sep != NULL && sep != path && sep[1] == '\0') {
		edge = rewind_slashes(path, sep);
		if (edge != path)
			sep = memrchr(path, '/', edge - path);
	}

	/*
	 * No directory part at all. Returning a static string is ill-designed
	 * but the XPG specs require "." here.
	 */
	if (sep == NULL)
		return (char *)dot;

	edge = rewind_slashes(path, sep);
	if (edge != path) {
		/* Cut in front of the separating slash(es). */
		sep = edge;
	} else if (sep == path + 1) {
		/*
		 * Exactly two leading slashes are preserved as "//". See
		 * XBD 4.10 Path Name Resolution for more information.
		 */
		sep++;
	} else {
		/* Only leading slashes in front of the last component: "/". */
		sep = path + 1;
	}

	*sep = '\0';

	return path;
}
279
280 static uint64_t get_min_memlimit(const char *cgroup, bool swap)
281 {
282 __do_free char *copy = NULL;
283 uint64_t memlimit = 0, retlimit = 0;
284
285 copy = strdup(cgroup);
286 if (!copy)
287 return log_error_errno(0, ENOMEM, "Failed to allocate memory");
288
289 retlimit = get_memlimit(copy, swap);
290
291 /*
292 * If the cgroup doesn't start with / (probably won't happen), dirname()
293 * will terminate with "" instead of "/"
294 */
295 while (*copy && strcmp(copy, "/") != 0) {
296 char *it = copy;
297
298 it = gnu_dirname(it);
299 memlimit = get_memlimit(it, swap);
300 if (memlimit > 0 && memlimit < retlimit)
301 retlimit = memlimit;
302 };
303
304 return retlimit;
305 }
306
/* Return true if the string @line begins with the string @pref. */
static inline bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	if (strncmp(line, pref, preflen) != 0)
		return false;

	return true;
}
311
312 static int proc_swaps_read(char *buf, size_t size, off_t offset,
313 struct fuse_file_info *fi)
314 {
315 __do_free char *cgroup = NULL, *memusage_str = NULL,
316 *memswusage_str = NULL, *memswpriority_str = NULL;
317 struct fuse_context *fc = fuse_get_context();
318 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
319 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
320 uint64_t memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0,
321 swtotal = 0, swusage = 0, memswpriority = 1,
322 hostswtotal = 0, hostswfree = 0;
323 ssize_t total_len = 0;
324 ssize_t l = 0;
325 char *cache = d->buf;
326 int ret;
327 __do_free char *line = NULL;
328 __do_free void *fopen_cache = NULL;
329 __do_fclose FILE *f = NULL;
330 size_t linelen = 0;
331
332 if (offset) {
333 int left;
334
335 if (offset > d->size)
336 return -EINVAL;
337
338 if (!d->cached)
339 return 0;
340
341 left = d->size - offset;
342 total_len = left > size ? size: left;
343 memcpy(buf, cache + offset, total_len);
344
345 return total_len;
346 }
347
348 pid_t initpid = lookup_initpid_in_store(fc->pid);
349 if (initpid <= 1 || is_shared_pidns(initpid))
350 initpid = fc->pid;
351
352 cgroup = get_pid_cgroup(initpid, "memory");
353 if (!cgroup)
354 return read_file_fuse("/proc/swaps", buf, size, d);
355 prune_init_slice(cgroup);
356
357 memlimit = get_min_memlimit(cgroup, false);
358
359 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
360 if (ret < 0)
361 return 0;
362
363 if (safe_uint64(memusage_str, &memusage, 10) < 0)
364 lxcfs_error("Failed to convert memusage %s", memusage_str);
365
366 if (wants_swap) {
367 memswlimit = get_min_memlimit(cgroup, true);
368 if (memswlimit > 0) {
369 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
370 if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) {
371 if (memlimit > memswlimit)
372 swtotal = 0;
373 else
374 swtotal = (memswlimit - memlimit) / 1024;
375 if (memusage > memswusage || swtotal == 0)
376 swusage = 0;
377 else
378 swusage = (memswusage - memusage) / 1024;
379 }
380
381 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
382 if (ret >= 0)
383 safe_uint64(memswpriority_str, &memswpriority, 10);
384 }
385 }
386
387 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
388
389 /* Read host total and free values */
390 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
391 if (!f)
392 return 0;
393
394 while (getline(&line, &linelen, f) != -1) {
395 if (startswith(line, "SwapTotal:"))
396 sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal);
397 else if (startswith(line, "SwapFree:"))
398 sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree);
399 }
400
401 if (wants_swap) {
402 /* The total amount of swap is always reported to be the
403 lesser of the RAM+SWAP limit or the SWAP device size.
404 This is because the kernel can swap as much as it
405 wants and not only up to swtotal. */
406 swtotal = memlimit / 1024 + swtotal;
407 if (hostswtotal < swtotal) {
408 swtotal = hostswtotal;
409 }
410
411 /* When swappiness is 0, pretend we can't swap. */
412 if (memswpriority == 0) {
413 swtotal = swusage;
414 }
415 }
416
417 if (swtotal > 0) {
418 l = snprintf(d->buf + total_len, d->size - total_len,
419 "none%*svirtual\t\t%" PRIu64 "\t%" PRIu64 "\t0\n",
420 36, " ", swtotal, swusage);
421 total_len += l;
422 }
423
424 if (total_len < 0 || l < 0)
425 return log_error(0, "Failed writing to cache");
426
427 d->cached = 1;
428 d->size = (int)total_len;
429
430 if (total_len > size)
431 total_len = size;
432 memcpy(buf, d->buf, total_len);
433
434 return total_len;
435 }
436
/*
 * Look up one counter in the contents of a blkio statistics file.
 *
 * @str:    newline separated lines of the form "<major>:<minor> <iotype> <n>";
 *          may be NULL, which is treated like an empty file
 * @major:  device major number to look for
 * @minor:  device minor number to look for
 * @iotype: operation name to look for, e.g. "Read"
 * @v:      receives the value of the first matching line, 0 if there is no
 *          matching line or its value cannot be parsed
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32];
	size_t len;
	int ret;

	*v = 0;

	if (!str)
		return;

	ret = snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	/* A truncated key could match lines of a different operation. */
	if (ret < 0 || (size_t)ret >= sizeof(key))
		return;
	len = (size_t)ret;

	while (*str) {
		if (strncmp(str, key, len) == 0) {
			/* On a parse failure *v keeps the 0 assigned above. */
			sscanf(str + len, "%" SCNu64, v);
			return;
		}

		str = strchr(str, '\n');
		if (!str)
			return;
		str++;
	}
}
460
/*
 * One line of /proc/diskstats. The number in each comment is the position of
 * the field within the line.
 */
struct lxcfs_diskstats {
	/* device identification */
	unsigned int major;       /*  1 - major number */
	unsigned int minor;       /*  2 - minor number */
	char dev_name[72];        /*  3 - device name */

	/* reads */
	uint64_t read;            /*  4 - reads completed successfully */
	uint64_t read_merged;     /*  5 - reads merged */
	uint64_t read_sectors;    /*  6 - sectors read */
	uint64_t read_ticks;      /*  7 - time spent reading (ms) */

	/* writes */
	uint64_t write;           /*  8 - writes completed */
	uint64_t write_merged;    /*  9 - writes merged */
	uint64_t write_sectors;   /* 10 - sectors written */
	uint64_t write_ticks;     /* 11 - time spent writing (ms) */

	/* queue */
	uint64_t ios_pgr;         /* 12 - I/Os currently in progress */
	uint64_t total_ticks;     /* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;        /* 14 - weighted time spent doing I/Os (ms) */

	/* discards */
	uint64_t discard;         /* 15 - discards completed successfully (4.18+) */
	uint64_t discard_merged;  /* 16 - discards merged (4.18+) */
	uint64_t discard_sectors; /* 17 - sectors discarded (4.18+) */
	uint64_t discard_ticks;   /* 18 - time spent discarding (4.18+) */
};
481
482 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
483 struct fuse_file_info *fi)
484 {
485 __do_free char *cg = NULL, *io_serviced_str = NULL,
486 *io_merged_str = NULL, *io_service_bytes_str = NULL,
487 *io_wait_time_str = NULL, *io_service_time_str = NULL,
488 *line = NULL;
489 __do_free void *fopen_cache = NULL;
490 __do_fclose FILE *f = NULL;
491 struct fuse_context *fc = fuse_get_context();
492 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
493 struct lxcfs_diskstats stats = {};
494 /* helper fields */
495 uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
496 write_wait_time, discard_wait_time;
497 char *cache = d->buf;
498 size_t cache_size = d->buflen;
499 size_t linelen = 0, total_len = 0;
500 int i = 0;
501 int ret;
502
503 if (offset) {
504 int left;
505
506 if (offset > d->size)
507 return -EINVAL;
508
509 if (!d->cached)
510 return 0;
511
512 left = d->size - offset;
513 total_len = left > size ? size: left;
514 memcpy(buf, cache + offset, total_len);
515
516 return total_len;
517 }
518
519 pid_t initpid = lookup_initpid_in_store(fc->pid);
520 if (initpid <= 1 || is_shared_pidns(initpid))
521 initpid = fc->pid;
522
523 cg = get_pid_cgroup(initpid, "blkio");
524 if (!cg)
525 return read_file_fuse("/proc/diskstats", buf, size, d);
526 prune_init_slice(cg);
527
528 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
529 if (ret < 0) {
530 if (ret == -EOPNOTSUPP)
531 return read_file_fuse("/proc/diskstats", buf, size, d);
532 }
533
534 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
535 if (ret < 0) {
536 if (ret == -EOPNOTSUPP)
537 return read_file_fuse("/proc/diskstats", buf, size, d);
538 }
539
540 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
541 if (ret < 0) {
542 if (ret == -EOPNOTSUPP)
543 return read_file_fuse("/proc/diskstats", buf, size, d);
544 }
545
546 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
547 if (ret < 0) {
548 if (ret == -EOPNOTSUPP)
549 return read_file_fuse("/proc/diskstats", buf, size, d);
550 }
551
552 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
553 if (ret < 0) {
554 if (ret == -EOPNOTSUPP)
555 return read_file_fuse("/proc/diskstats", buf, size, d);
556 }
557
558 f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
559 if (!f)
560 return 0;
561
562 while (getline(&line, &linelen, f) != -1) {
563 ssize_t l;
564 char lbuf[256];
565
566 i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
567 if (i != 3)
568 continue;
569
570 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
571 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
572 get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);
573
574 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
575 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
576 get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);
577
578 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
579 stats.read_sectors = stats.read_sectors / 512;
580 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
581 stats.write_sectors = stats.write_sectors / 512;
582 get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
583 stats.discard_sectors = stats.discard_sectors / 512;
584
585 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
586 read_service_time = read_service_time / 1000000;
587 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
588 read_wait_time = read_wait_time / 1000000;
589 stats.read_ticks = read_service_time + read_wait_time;
590
591 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
592 write_service_time = write_service_time / 1000000;
593 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
594 write_wait_time = write_wait_time / 1000000;
595 stats.write_ticks = write_service_time + write_wait_time;
596
597 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
598 discard_service_time = discard_service_time / 1000000;
599 get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
600 discard_wait_time = discard_wait_time / 1000000;
601 stats.discard_ticks = discard_service_time + discard_wait_time;
602
603 get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
604 stats.total_ticks = stats.total_ticks / 1000000;
605
606 memset(lbuf, 0, 256);
607 if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
608 stats.read_sectors || stats.write_sectors || stats.read_ticks ||
609 stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks ||
610 stats.discard_merged || stats.discard_sectors || stats.discard_ticks)
611 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
612 stats.major,
613 stats.minor,
614 stats.dev_name,
615 stats.read,
616 stats.read_merged,
617 stats.read_sectors,
618 stats.read_ticks,
619 stats.write,
620 stats.write_merged,
621 stats.write_sectors,
622 stats.write_ticks,
623 stats.ios_pgr,
624 stats.total_ticks,
625 stats.rq_ticks,
626 stats.discard_merged,
627 stats.discard_sectors,
628 stats.discard_ticks);
629 else
630 continue;
631
632 l = snprintf(cache, cache_size, "%s", lbuf);
633 if (l < 0)
634 return log_error(0, "Failed to write cache");
635 if (l >= cache_size)
636 return log_error(0, "Write to cache was truncated");
637
638 cache += l;
639 cache_size -= l;
640 total_len += l;
641 }
642
643 d->cached = 1;
644 d->size = total_len;
645 if (total_len > size)
646 total_len = size;
647 memcpy(buf, d->buf, total_len);
648
649 return total_len;
650 }
651
#if RELOADTEST
/*
 * Leave a marker file behind so that a test can tell that this code has been
 * executed. Failures (e.g. the marker already exists) are deliberately
 * ignored.
 */
static inline void iwashere(void)
{
	/*
	 * mknod(path, mode, dev): the permission bits belong into the mode
	 * argument, dev is not used for regular files.
	 */
	mknod("/tmp/lxcfs-iwashere", S_IFREG | 0644, 0);
}
#endif
658
659 /*
660 * This function retrieves the busy time of a group of tasks by looking at
661 * cpuacct.usage. Unfortunately, this only makes sense when the container has
662 * been given it's own cpuacct cgroup. If not, this function will take the busy
663 * time of all other taks that do not actually belong to the container into
664 * account as well. If someone has a clever solution for this please send a
665 * patch!
666 */
667 static double get_reaper_busy(pid_t task)
668 {
669 __do_free char *cgroup = NULL, *usage_str = NULL;
670 uint64_t usage = 0;
671 pid_t initpid;
672
673 initpid = lookup_initpid_in_store(task);
674 if (initpid <= 0)
675 return 0;
676
677 cgroup = get_pid_cgroup(initpid, "cpuacct");
678 if (!cgroup)
679 return 0;
680 prune_init_slice(cgroup);
681
682 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
683 return 0;
684
685 if (safe_uint64(usage_str, &usage, 10) < 0)
686 lxcfs_error("Failed to convert usage %s", usage_str);
687
688 return ((double)usage / 1000000000);
689 }
690
691 static uint64_t get_reaper_start_time(pid_t pid)
692 {
693 __do_free void *fopen_cache = NULL;
694 __do_fclose FILE *f = NULL;
695 int ret;
696 uint64_t starttime;
697 char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 +
698 STRLITERALLEN("/stat") + 1];
699 pid_t qpid;
700
701 qpid = lookup_initpid_in_store(pid);
702 if (qpid <= 0)
703 return ret_errno(EINVAL);
704
705 ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid);
706 if (ret < 0 || (size_t)ret >= sizeof(path))
707 return ret_errno(EINVAL);
708
709 f = fopen_cached(path, "re", &fopen_cache);
710 if (!f)
711 return ret_errno(EINVAL);
712
713 /* Note that the *scanf() argument supression requires that length
714 * modifiers such as "l" are omitted. Otherwise some compilers will yell
715 * at us. It's like telling someone you're not married and then asking
716 * if you can bring your wife to the party.
717 */
718 ret = fscanf(f, "%*d " /* (1) pid %d */
719 "%*s " /* (2) comm %s */
720 "%*c " /* (3) state %c */
721 "%*d " /* (4) ppid %d */
722 "%*d " /* (5) pgrp %d */
723 "%*d " /* (6) session %d */
724 "%*d " /* (7) tty_nr %d */
725 "%*d " /* (8) tpgid %d */
726 "%*u " /* (9) flags %u */
727 "%*u " /* (10) minflt %lu */
728 "%*u " /* (11) cminflt %lu */
729 "%*u " /* (12) majflt %lu */
730 "%*u " /* (13) cmajflt %lu */
731 "%*u " /* (14) utime %lu */
732 "%*u " /* (15) stime %lu */
733 "%*d " /* (16) cutime %ld */
734 "%*d " /* (17) cstime %ld */
735 "%*d " /* (18) priority %ld */
736 "%*d " /* (19) nice %ld */
737 "%*d " /* (20) num_threads %ld */
738 "%*d " /* (21) itrealvalue %ld */
739 "%" PRIu64, /* (22) starttime %llu */
740 &starttime);
741 if (ret != 1)
742 return ret_errno(EINVAL);
743
744 return ret_set_errno(starttime, 0);
745 }
746
/*
 * Return the start time of the init process belonging to @pid in seconds,
 * i.e. the value of get_reaper_start_time() divided by the number of clock
 * ticks per second.
 *
 * Returns 0 (after a debug message) if the start time or the tick rate
 * cannot be determined.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	long ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	/*
	 * get_reaper_start_time() reports failures through its unsigned
	 * return value, where a "<= 0" test can only ever catch 0. Looking at
	 * the value as signed also catches error codes that got wrapped
	 * around; a genuine start time never comes close to INT64_MAX ticks.
	 */
	if ((int64_t)clockticks <= 0)
		return log_debug(0, "Failed to retrieve start time of pid %d", pid);

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* 0 is rejected as well, it would be used as divisor below. */
	if (ticks_per_sec <= 0)
		return log_debug(0, "Failed to determine number of clock ticks in a second");

	return (double)clockticks / (double)ticks_per_sec;
}
765
766 static double get_reaper_age(pid_t pid)
767 {
768 uint64_t uptime_ms;
769 double procstart, procage;
770
771 /*
772 * We need to substract the time the process has started since system
773 * boot minus the time when the system has started to get the actual
774 * reaper age.
775 */
776 procstart = get_reaper_start_time_in_sec(pid);
777 procage = procstart;
778 if (procstart > 0) {
779 int ret;
780 struct timespec spec;
781
782 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
783 if (ret < 0)
784 return 0;
785
786 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
787 procage = (uptime_ms - (procstart * 1000)) / 1000;
788 }
789
790 return procage;
791 }
792
793 /*
794 * We read /proc/uptime and reuse its second field.
795 * For the first field, we use the mtime for the reaper for
796 * the calling pid as returned by getreaperage
797 */
798 static int proc_uptime_read(char *buf, size_t size, off_t offset,
799 struct fuse_file_info *fi)
800 {
801 struct fuse_context *fc = fuse_get_context();
802 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
803 char *cache = d->buf;
804 ssize_t total_len = 0, ret = 0;
805 double busytime, idletime, reaperage;
806
807 #if RELOADTEST
808 iwashere();
809 #endif
810
811 if (offset) {
812 int left;
813
814 if (offset > d->size)
815 return -EINVAL;
816
817 if (!d->cached)
818 return 0;
819
820 left = d->size - offset;
821 total_len = left > size ? size : left;
822 memcpy(buf, cache + offset, total_len);
823
824 return total_len;
825 }
826
827 reaperage = get_reaper_age(fc->pid);
828 /*
829 * To understand why this is done, please read the comment to the
830 * get_reaper_busy() function.
831 */
832 idletime = reaperage;
833 busytime = get_reaper_busy(fc->pid);
834 if (reaperage >= busytime)
835 idletime = reaperage - busytime;
836
837 ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
838 if (ret < 0 || ret >= d->buflen)
839 return read_file_fuse("/proc/uptime", buf, size, d);
840 total_len = ret;
841
842 d->cached = 1;
843 d->size = total_len;
844 if (total_len > size)
845 total_len = size;
846 memcpy(buf, d->buf, total_len);
847
848 return total_len;
849 }
850
851 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
852 static int proc_stat_read(char *buf, size_t size, off_t offset,
853 struct fuse_file_info *fi)
854 {
855 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
856 __do_free void *fopen_cache = NULL;
857 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
858 __do_fclose FILE *f = NULL;
859 struct fuse_context *fc = fuse_get_context();
860 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
861 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
862 size_t linelen = 0, total_len = 0;
863 int curcpu = -1; /* cpu numbering starts at 0 */
864 int physcpu = 0;
865 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
866 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
867 uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
868 iowait_sum = 0, irq_sum = 0, softirq_sum = 0, steal_sum = 0,
869 guest_sum = 0, guest_nice_sum = 0;
870 char cpuall[CPUALL_MAX_SIZE];
871 /* reserve for cpu all */
872 char *cache = d->buf + CPUALL_MAX_SIZE;
873 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
874 int cg_cpu_usage_size = 0;
875
876 if (offset) {
877 int left;
878
879 if (offset > d->size)
880 return -EINVAL;
881
882 if (!d->cached)
883 return 0;
884
885 left = d->size - offset;
886 total_len = left > size ? size : left;
887 memcpy(buf, d->buf + offset, total_len);
888
889 return total_len;
890 }
891
892 pid_t initpid = lookup_initpid_in_store(fc->pid);
893 if (initpid <= 1 || is_shared_pidns(initpid))
894 initpid = fc->pid;
895
896 /*
897 * when container run with host pid namespace initpid == 1, cgroup will "/"
898 * we should return host os's /proc contents.
899 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
900 */
901 if (initpid == 1)
902 return read_file_fuse("/proc/stat", buf, size, d);
903
904 cg = get_pid_cgroup(initpid, "cpuset");
905 if (!cg)
906 return read_file_fuse("/proc/stat", buf, size, d);
907 prune_init_slice(cg);
908
909 cpuset = get_cpuset(cg);
910 if (!cpuset)
911 return 0;
912
913 f = fopen_cached("/proc/stat", "re", &fopen_cache);
914 if (!f)
915 return 0;
916
917 /* Skip first system cpu line. */
918 if (getline(&line, &linelen, f) < 0)
919 return log_error(0, "proc_stat_read read first line failed");
920
921 /*
922 * Read cpuacct.usage_all for all CPUs.
923 * If the cpuacct cgroup is present, it is used to calculate the container's
924 * CPU usage. If not, values from the host's /proc/stat are used.
925 */
926 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) {
927 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) {
928 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage,
929 cg_cpu_usage_size, f,
930 d->buf, d->buflen);
931 goto out;
932 }
933 } else {
934 lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat");
935 }
936
937 while (getline(&line, &linelen, f) != -1) {
938 ssize_t l;
939 char cpu_char[10]; /* That's a lot of cores */
940 char *c;
941 uint64_t all_used, cg_used, new_idle;
942 int ret;
943
944 if (strlen(line) == 0)
945 continue;
946 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
947 /* not a ^cpuN line containing a number N, just print it */
948 l = snprintf(cache, cache_size, "%s", line);
949 if (l < 0)
950 return log_error(0, "Failed to write cache");
951 if (l >= cache_size)
952 return log_error(0, "Write to cache was truncated");
953
954 cache += l;
955 cache_size -= l;
956 total_len += l;
957
958 continue;
959 }
960
961 if (sscanf(cpu_char, "%d", &physcpu) != 1)
962 continue;
963
964 if (!cpu_in_cpuset(physcpu, cpuset))
965 continue;
966
967 curcpu++;
968
969 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
970 &user,
971 &nice,
972 &system,
973 &idle,
974 &iowait,
975 &irq,
976 &softirq,
977 &steal,
978 &guest,
979 &guest_nice);
980 if (ret != 10 || !cg_cpu_usage) {
981 c = strchr(line, ' ');
982 if (!c)
983 continue;
984
985 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
986 if (l < 0)
987 return log_error(0, "Failed to write cache");
988 if (l >= cache_size)
989 return log_error(0, "Write to cache was truncated");
990
991 cache += l;
992 cache_size -= l;
993 total_len += l;
994
995 if (ret != 10)
996 continue;
997 }
998
999 if (cg_cpu_usage) {
1000 if (physcpu >= cg_cpu_usage_size)
1001 break;
1002
1003 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
1004 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
1005
1006 if (all_used >= cg_used) {
1007 new_idle = idle + (all_used - cg_used);
1008 } else {
1009 lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
1010 curcpu, cg, all_used, cg_used);
1011 new_idle = idle;
1012 }
1013
1014 l = snprintf(cache, cache_size,
1015 "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
1016 curcpu, cg_cpu_usage[physcpu].user,
1017 cg_cpu_usage[physcpu].system, new_idle);
1018 if (l < 0)
1019 return log_error(0, "Failed to write cache");
1020 if (l >= cache_size)
1021 return log_error(0, "Write to cache was truncated");
1022
1023 cache += l;
1024 cache_size -= l;
1025 total_len += l;
1026
1027 user_sum += cg_cpu_usage[physcpu].user;
1028 system_sum += cg_cpu_usage[physcpu].system;
1029 idle_sum += new_idle;
1030 } else {
1031 user_sum += user;
1032 nice_sum += nice;
1033 system_sum += system;
1034 idle_sum += idle;
1035 iowait_sum += iowait;
1036 irq_sum += irq;
1037 softirq_sum += softirq;
1038 steal_sum += steal;
1039 guest_sum += guest;
1040 guest_nice_sum += guest_nice;
1041 }
1042 }
1043
1044 cache = d->buf;
1045
1046 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1047 user_sum,
1048 nice_sum,
1049 system_sum,
1050 idle_sum,
1051 iowait_sum,
1052 irq_sum,
1053 softirq_sum,
1054 steal_sum,
1055 guest_sum,
1056 guest_nice_sum);
1057 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
1058 memcpy(cache, cpuall, cpuall_len);
1059 cache += cpuall_len;
1060 } else {
1061 /* shouldn't happen */
1062 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len);
1063 cpuall_len = 0;
1064 }
1065
1066 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1067 total_len += cpuall_len;
1068
1069 out:
1070 d->cached = 1;
1071 d->size = total_len;
1072 if (total_len > size)
1073 total_len = size;
1074
1075 memcpy(buf, d->buf, total_len);
1076 return total_len;
1077 }
1078
1079 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
1080 static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat)
1081 {
1082 __do_close int fd = -EBADF;
1083 __do_fclose FILE *f = NULL;
1084 __do_free char *line = NULL;
1085 __do_free void *fdopen_cache = NULL;
1086 bool unified;
1087 size_t len = 0;
1088 ssize_t linelen;
1089
1090 fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup);
1091 if (fd < 0)
1092 return false;
1093
1094 f = fdopen_cached(fd, "re", &fdopen_cache);
1095 if (!f)
1096 return false;
1097
1098 unified = pure_unified_layout(cgroup_ops);
1099 while ((linelen = getline(&line, &len, f)) != -1) {
1100 if (!unified && startswith(line, "hierarchical_memory_limit")) {
1101 sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit));
1102 } else if (!unified && startswith(line, "hierarchical_memsw_limit")) {
1103 sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit));
1104 } else if (startswith(line, unified ? "file" :"total_cache")) {
1105 sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache));
1106 } else if (!unified && startswith(line, "total_rss")) {
1107 sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss));
1108 } else if (!unified && startswith(line, "total_rss_huge")) {
1109 sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge));
1110 } else if (startswith(line, unified ? "shmem" : "total_shmem")) {
1111 sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem));
1112 } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) {
1113 sscanf(line, unified ? "file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file));
1114 } else if (!unified && startswith(line, "total_dirty")) {
1115 sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty));
1116 } else if (!unified && startswith(line, "total_writeback")) {
1117 sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback));
1118 } else if (!unified && startswith(line, "total_swap")) {
1119 sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap));
1120 } else if (!unified && startswith(line, "total_pgpgin")) {
1121 sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin));
1122 } else if (!unified && startswith(line, "total_pgpgout")) {
1123 sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout));
1124 } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) {
1125 sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault));
1126 } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) {
1127 sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault));
1128 } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) {
1129 sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon));
1130 } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) {
1131 sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon));
1132 } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) {
1133 sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file));
1134 } else if (startswith(line, unified ? "active_file" : "total_active_file")) {
1135 sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file));
1136 } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) {
1137 sscanf(line, unified ? "unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable));
1138 }
1139 }
1140
1141 return true;
1142 }
1143
1144 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1145 struct fuse_file_info *fi)
1146 {
1147 __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL,
1148 *memswusage_str = NULL, *memswpriority_str = NULL;
1149 __do_free void *fopen_cache = NULL;
1150 __do_fclose FILE *f = NULL;
1151 struct fuse_context *fc = fuse_get_context();
1152 bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON);
1153 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1154 uint64_t memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
1155 hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0,
1156 memswpriority = 1;
1157 struct memory_stat mstat = {};
1158 size_t linelen = 0, total_len = 0;
1159 char *cache = d->buf;
1160 size_t cache_size = d->buflen;
1161 int ret;
1162
1163 if (offset) {
1164 int left;
1165
1166 if (offset > d->size)
1167 return -EINVAL;
1168
1169 if (!d->cached)
1170 return 0;
1171
1172 left = d->size - offset;
1173 total_len = left > size ? size : left;
1174 memcpy(buf, cache + offset, total_len);
1175
1176 return total_len;
1177 }
1178
1179 pid_t initpid = lookup_initpid_in_store(fc->pid);
1180 if (initpid <= 1 || is_shared_pidns(initpid))
1181 initpid = fc->pid;
1182
1183 cgroup = get_pid_cgroup(initpid, "memory");
1184 if (!cgroup)
1185 return read_file_fuse("/proc/meminfo", buf, size, d);
1186
1187 prune_init_slice(cgroup);
1188
1189 /* memory limits */
1190 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
1191 if (ret < 0)
1192 return read_file_fuse("/proc/meminfo", buf, size, d);
1193
1194 if (safe_uint64(memusage_str, &memusage, 10) < 0)
1195 lxcfs_error("Failed to convert memusage %s", memusage_str);
1196
1197 if (!cgroup_parse_memory_stat(cgroup, &mstat))
1198 return read_file_fuse("/proc/meminfo", buf, size, d);
1199
1200 memlimit = get_min_memlimit(cgroup, false);
1201
1202 /*
1203 * Following values are allowed to fail, because swapaccount might be
1204 * turned off for current kernel.
1205 */
1206 if (wants_swap) {
1207 memswlimit = get_min_memlimit(cgroup, true);
1208 if (memswlimit > 0) {
1209 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
1210 if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) {
1211 if (memlimit > memswlimit)
1212 swtotal = 0;
1213 else
1214 swtotal = (memswlimit - memlimit) / 1024;
1215 if (memusage > memswusage || swtotal == 0)
1216 swusage = 0;
1217 else
1218 swusage = (memswusage - memusage) / 1024;
1219 }
1220 }
1221
1222 ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
1223 if (ret >= 0)
1224 safe_uint64(memswpriority_str, &memswpriority, 10);
1225 }
1226
1227 f = fopen_cached("/proc/meminfo", "re", &fopen_cache);
1228 if (!f)
1229 return read_file_fuse("/proc/meminfo", buf, size, d);
1230
1231 memusage /= 1024;
1232 memlimit /= 1024;
1233 while (getline(&line, &linelen, f) != -1) {
1234 ssize_t l;
1235 char *printme, lbuf[100];
1236
1237 memset(lbuf, 0, 100);
1238 if (startswith(line, "MemTotal:")) {
1239 sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal);
1240 if (memlimit == 0)
1241 memlimit = hosttotal;
1242
1243 if (hosttotal < memlimit)
1244 memlimit = hosttotal;
1245 snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit);
1246 printme = lbuf;
1247 } else if (startswith(line, "MemFree:")) {
1248 snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage);
1249 printme = lbuf;
1250 } else if (startswith(line, "MemAvailable:")) {
1251 snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024);
1252 printme = lbuf;
1253 } else if (startswith(line, "SwapTotal:")) {
1254 if (wants_swap) {
1255 uint64_t hostswtotal = 0;
1256
1257 sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);
1258
1259 /* The total amount of swap is always reported to be the
1260 lesser of the RAM+SWAP limit or the SWAP device size.
1261 This is because the kernel can swap as much as it
1262 wants and not only up to swtotal. */
1263
1264 swtotal = memlimit + swtotal;
1265 if (hostswtotal < swtotal) {
1266 swtotal = hostswtotal;
1267 }
1268
1269 /* When swappiness is 0, pretend we can't swap. */
1270 if (memswpriority == 0) {
1271 swtotal = swusage;
1272 }
1273 }
1274
1275 snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal);
1276 printme = lbuf;
1277 } else if (startswith(line, "SwapFree:")) {
1278 if (wants_swap) {
1279 swfree = swtotal - swusage;
1280 }
1281
1282 snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree);
1283 printme = lbuf;
1284 } else if (startswith(line, "Slab:")) {
1285 snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", (uint64_t)0);
1286 printme = lbuf;
1287 } else if (startswith(line, "Buffers:")) {
1288 snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0);
1289 printme = lbuf;
1290 } else if (startswith(line, "Cached:")) {
1291 snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n",
1292 mstat.total_cache / 1024);
1293 printme = lbuf;
1294 } else if (startswith(line, "SwapCached:")) {
1295 snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0);
1296 printme = lbuf;
1297 } else if (startswith(line, "Active:")) {
1298 snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n",
1299 (mstat.total_active_anon +
1300 mstat.total_active_file) /
1301 1024);
1302 printme = lbuf;
1303 } else if (startswith(line, "Inactive:")) {
1304 snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n",
1305 (mstat.total_inactive_anon +
1306 mstat.total_inactive_file) /
1307 1024);
1308 printme = lbuf;
1309 } else if (startswith(line, "Active(anon):")) {
1310 snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n",
1311 mstat.total_active_anon / 1024);
1312 printme = lbuf;
1313 } else if (startswith(line, "Inactive(anon):")) {
1314 snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n",
1315 mstat.total_inactive_anon / 1024);
1316 printme = lbuf;
1317 } else if (startswith(line, "Active(file):")) {
1318 snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n",
1319 mstat.total_active_file / 1024);
1320 printme = lbuf;
1321 } else if (startswith(line, "Inactive(file):")) {
1322 snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n",
1323 mstat.total_inactive_file / 1024);
1324 printme = lbuf;
1325 } else if (startswith(line, "Unevictable:")) {
1326 snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n",
1327 mstat.total_unevictable / 1024);
1328 printme = lbuf;
1329 } else if (startswith(line, "Dirty:")) {
1330 snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n",
1331 mstat.total_dirty / 1024);
1332 printme = lbuf;
1333 } else if (startswith(line, "Writeback:")) {
1334 snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n",
1335 mstat.total_writeback / 1024);
1336 printme = lbuf;
1337 } else if (startswith(line, "AnonPages:")) {
1338 snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n",
1339 (mstat.total_active_anon +
1340 mstat.total_inactive_anon - mstat.total_shmem) /
1341 1024);
1342 printme = lbuf;
1343 } else if (startswith(line, "Mapped:")) {
1344 snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n",
1345 mstat.total_mapped_file / 1024);
1346 printme = lbuf;
1347 } else if (startswith(line, "SReclaimable:")) {
1348 snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", (uint64_t)0);
1349 printme = lbuf;
1350 } else if (startswith(line, "SUnreclaim:")) {
1351 snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", (uint64_t)0);
1352 printme = lbuf;
1353 } else if (startswith(line, "Shmem:")) {
1354 snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n",
1355 mstat.total_shmem / 1024);
1356 printme = lbuf;
1357 } else if (startswith(line, "ShmemHugePages:")) {
1358 snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0);
1359 printme = lbuf;
1360 } else if (startswith(line, "ShmemPmdMapped:")) {
1361 snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0);
1362 printme = lbuf;
1363 } else if (startswith(line, "AnonHugePages:")) {
1364 snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n",
1365 mstat.total_rss_huge / 1024);
1366 printme = lbuf;
1367 } else {
1368 printme = line;
1369 }
1370
1371 l = snprintf(cache, cache_size, "%s", printme);
1372 if (l < 0)
1373 return log_error(0, "Failed to write cache");
1374 if (l >= cache_size)
1375 return log_error(0, "Write to cache was truncated");
1376
1377 cache += l;
1378 cache_size -= l;
1379 total_len += l;
1380 }
1381
1382 d->cached = 1;
1383 d->size = total_len;
1384 if (total_len > size)
1385 total_len = size;
1386 memcpy(buf, d->buf, total_len);
1387
1388 return total_len;
1389 }
1390
1391 static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
1392 struct fuse_file_info *fi)
1393 {
1394 __do_free char *cgroup = NULL, *line = NULL;
1395 __do_free void *fopen_cache = NULL;
1396 __do_fclose FILE *f = NULL;
1397 __do_close int fd = -EBADF;
1398 struct fuse_context *fc = fuse_get_context();
1399 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1400 size_t linelen = 0, total_len = 0;
1401 char *cache = d->buf;
1402 size_t cache_size = d->buflen;
1403 pid_t initpid;
1404
1405 if (offset) {
1406 int left;
1407
1408 if (offset > d->size)
1409 return -EINVAL;
1410
1411 if (!d->cached)
1412 return 0;
1413
1414 left = d->size - offset;
1415 total_len = left > size ? size : left;
1416 memcpy(buf, cache + offset, total_len);
1417
1418 return total_len;
1419 }
1420
1421 initpid = lookup_initpid_in_store(fc->pid);
1422 if (initpid <= 1 || is_shared_pidns(initpid))
1423 initpid = fc->pid;
1424
1425 cgroup = get_pid_cgroup(initpid, "memory");
1426 if (!cgroup)
1427 return read_file_fuse("/proc/slabinfo", buf, size, d);
1428
1429 prune_init_slice(cgroup);
1430
1431 fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup);
1432 if (fd < 0)
1433 return read_file_fuse("/proc/slabinfo", buf, size, d);
1434
1435 f = fdopen_cached(fd, "re", &fopen_cache);
1436 if (!f)
1437 return read_file_fuse("/proc/slabinfo", buf, size, d);
1438
1439 while (getline(&line, &linelen, f) != -1) {
1440 ssize_t l = snprintf(cache, cache_size, "%s", line);
1441 if (l < 0)
1442 return log_error(0, "Failed to write cache");
1443 if (l >= cache_size)
1444 return log_error(0, "Write to cache was truncated");
1445
1446 cache += l;
1447 cache_size -= l;
1448 total_len += l;
1449 }
1450
1451 d->cached = 1;
1452 d->size = total_len;
1453 if (total_len > size)
1454 total_len = size;
1455 memcpy(buf, d->buf, total_len);
1456
1457 return total_len;
1458 }
1459
1460 __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
1461 off_t offset, struct fuse_file_info *fi)
1462 {
1463 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1464
1465 switch (f->type) {
1466 case LXC_TYPE_PROC_MEMINFO:
1467 if (liblxcfs_functional())
1468 return proc_meminfo_read(buf, size, offset, fi);
1469
1470 return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH,
1471 buf, size, offset, f);
1472 case LXC_TYPE_PROC_CPUINFO:
1473 if (liblxcfs_functional())
1474 return proc_cpuinfo_read(buf, size, offset, fi);
1475
1476 return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH,
1477 buf, size, offset, f);
1478 case LXC_TYPE_PROC_UPTIME:
1479 if (liblxcfs_functional())
1480 return proc_uptime_read(buf, size, offset, fi);
1481
1482 return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH,
1483 buf, size, offset, f);
1484 case LXC_TYPE_PROC_STAT:
1485 if (liblxcfs_functional())
1486 return proc_stat_read(buf, size, offset, fi);
1487
1488 return read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf,
1489 size, offset, f);
1490 case LXC_TYPE_PROC_DISKSTATS:
1491 if (liblxcfs_functional())
1492 return proc_diskstats_read(buf, size, offset, fi);
1493
1494 return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH,
1495 buf, size, offset, f);
1496 case LXC_TYPE_PROC_SWAPS:
1497 if (liblxcfs_functional())
1498 return proc_swaps_read(buf, size, offset, fi);
1499
1500 return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf,
1501 size, offset, f);
1502 case LXC_TYPE_PROC_LOADAVG:
1503 if (liblxcfs_functional())
1504 return proc_loadavg_read(buf, size, offset, fi);
1505
1506 return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH,
1507 buf, size, offset, f);
1508 case LXC_TYPE_PROC_SLABINFO:
1509 if (liblxcfs_functional())
1510 return proc_slabinfo_read(buf, size, offset, fi);
1511
1512 return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH,
1513 buf, size, offset, f);
1514 }
1515
1516 return -EINVAL;
1517 }