/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "config.h"

#ifdef HAVE_FUSE3
#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 30
#endif
#else
#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

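/*
 * Reset a node to the given usage snapshot and zero out its view, e.g. after
 * the tracked cgroup was recreated and its counters went backwards.
 */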
static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

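/*
 * Grow a node's per-CPU usage/view arrays to cpu_count entries, carrying over
 * the data already collected for the existing CPUs.
 */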
static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

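/*
 * Link a freshly allocated node into the history hash. If a node for the same
 * cgroup is already present, the new node is freed and the existing node is
 * returned instead.
 */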
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
	struct cg_proc_stat *rv = new_node;
	int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *cur;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

	cur = head->next;

	for (;;) {
		/*
		 * The node to be added is already present in the list, so
		 * free the newly allocated one and return the one we found.
		 */
		if (strcmp(cur->cg, new->cg) == 0) {
			rv = cur;
			goto out_rwlock_unlock;
		}

		/* Keep walking. */
		if (cur->next) {
			cur = cur->next;
			continue;
		}

		/* Add new node to end of list. */
		cur->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

out_rwlock_unlock:
	pthread_rwlock_unlock(&head->lock);
	return move_ptr(rv);
}

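/*
 * Allocate a new node for the given cgroup, using the current usage snapshot
 * as its baseline.
 */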
static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

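/*
 * Check whether `file` is accessible for `cgroup` under `controller`; used
 * below to detect cgroups that have gone away.
 */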
static bool cgroup_supports(const char *controller, const char *cgroup,
			    const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return faccessat(cfd, path, F_OK, 0) == 0;
}

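/*
 * Walk one hash bucket and drop every node whose cgroup no longer supports
 * the cpu controller (i.e. has disappeared). Returns the new list head.
 */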
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
			call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			lxcfs_debug("Removing stat node for %s\n", cur->cg);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

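/*
 * Prune stale nodes from the history hash, at most once every
 * PROC_STAT_PRUNE_INTERVAL seconds per bucket.
 */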
#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

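/* Find the node for a cgroup in the given bucket, or NULL if not tracked. */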
static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

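/*
 * Look up the node for a cgroup, creating and registering it on first use,
 * and expanding its arrays if CPUs were hot-added. On success the node is
 * returned with its mutex held; the caller must unlock it.
 */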
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

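/*
 * Move up to `threshold` worth of time from `surplus` onto `counter`, paying
 * for it out of the CPU's idle time. This redistributes usage accrued on
 * CPUs the container cannot see onto the CPUs it can.
 */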
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	if (free_space > *surplus)
		to_add = *surplus;
	else
		to_add = free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

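/*
 * Compute per-CPU deltas between an older and a newer snapshot into `diff`
 * and return the sum of all deltas. CPUs offline in the newer snapshot are
 * skipped.
 */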
static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
 * through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
	bool first = true;
	int ret;

	if (pure_unified_layout(cgroup_ops)) {
		/*
		 * On the unified hierarchy both values live in "cpu.max",
		 * formatted as "$MAX $PERIOD", so pick the right column below.
		 */
		first = !strcmp(param, "quota");
		ret = snprintf(file, sizeof(file), "cpu.max");
	} else {
		ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	}
	if (ret < 0 || (size_t)ret >= sizeof(file))
		return false;

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use min value in cpu quota and cpuset. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

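/*
 * Render the container's view of /proc/stat into `buf`: an aggregate "cpu"
 * line followed by one "cpuN" line per visible CPU, then the remainder of the
 * host's file verbatim. Usage accrued beyond the quota-derived CPU limit is
 * folded into the visible CPUs. Returns the number of bytes written, 0 on
 * error.
 */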
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than are available in the cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff)
		goto out_pthread_mutex_unlock;

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* Revise the cpu usage view to support the partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			if (idle_sum > delta)
				idle_sum = idle_sum - delta;
			else
				idle_sum = 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			if (stat_node->view[curcpu].idle > delta)
				stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
			else
				stat_node->view[curcpu].idle = 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0) {
		total_len = log_error(0, "Failed to write cache");
		goto out_pthread_mutex_unlock;
	}
	if ((size_t)l >= buf_size) {
		total_len = log_error(0, "Write to cache was truncated");
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0) {
			total_len = log_error(0, "Failed to write cache");
			goto out_pthread_mutex_unlock;
		}
		if ((size_t)l >= buf_size) {
			total_len = log_error(0, "Write to cache was truncated");
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0) {
		total_len = log_error(0, "Failed to write cache");
		goto out_pthread_mutex_unlock;
	}
	if ((size_t)l >= buf_size) {
		total_len = log_error(0, "Write to cache was truncated");
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			total_len = log_error(0, "Failed to write cache");
			goto out_pthread_mutex_unlock;
		}
		if ((size_t)l >= buf_size) {
			total_len = log_error(0, "Write to cache was truncated");
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

out_pthread_mutex_unlock:
	/* The node lock has been held since find_or_create_proc_stat_node(). */
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a "^processor" line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}

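/* Check whether a "^processor" line belongs to a CPU in the given cpuset. */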
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

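/*
 * FUSE read handler for /proc/cpuinfo: emit only the processors in the
 * container's cpuset, renumbered from 0. When CFS-based sizing is enabled
 * (opts->use_cfs), the number of emitted processors is additionally capped
 * by the cgroup's CPU quota. s390x is special-cased since its cpuinfo layout
 * differs from other architectures.
 */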
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors", STRLITERALLEN("# processors")) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			i++;
			lxcfs_debug("cpu%d with time %s", i, tok);
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);

			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

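/* Allocate one bucket head and initialize its rwlock and lastcheck stamp. */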
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	__do_free struct cg_proc_stat_head *h = NULL;

	h = zalloc(sizeof(struct cg_proc_stat_head));
	if (!h)
		return false;

	if (pthread_rwlock_init(&h->lock, NULL))
		return false;

	h->lastcheck = time(NULL);

	*head = move_ptr(h);
	return true;
}

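/* Initialize every bucket of the global history hash. */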
bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

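/* Free all nodes in a bucket, destroy its rwlock, and free the head. */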
static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node;

	if (head->next) {
		node = head->next;

		for (;;) {
			struct cg_proc_stat *cur = node;
			node = node->next;
			free_proc_stat_node(cur);
			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

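/* Tear down the global history hash; counterpart to init_cpuview(). */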
void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}