]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/proc_cpuview.c
proc_cpuview: fix unused variable warning
[mirror_lxcfs.git] / src / proc_cpuview.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
1f5596dd
CB
2
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
f834b6bf
SP
7#include "config.h"
8
1f5596dd
CB
9#define __STDC_FORMAT_MACROS
10#include <dirent.h>
11#include <errno.h>
12#include <fcntl.h>
1f5596dd
CB
13#include <inttypes.h>
14#include <libgen.h>
15#include <pthread.h>
16#include <sched.h>
17#include <stdarg.h>
18#include <stdbool.h>
19#include <stdint.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23#include <time.h>
24#include <unistd.h>
25#include <wait.h>
26#include <linux/magic.h>
27#include <linux/sched.h>
28#include <sys/epoll.h>
29#include <sys/mman.h>
30#include <sys/mount.h>
31#include <sys/param.h>
32#include <sys/socket.h>
33#include <sys/syscall.h>
34#include <sys/sysinfo.h>
35#include <sys/vfs.h>
36
e01afbb7
CB
37#include "proc_cpuview.h"
38
1f5596dd 39#include "bindings.h"
1f5596dd
CB
40#include "cgroup_fuse.h"
41#include "cpuset_parse.h"
42#include "cgroups/cgroup.h"
43#include "cgroups/cgroup_utils.h"
44#include "memory_utils.h"
4ec5c9da 45#include "proc_loadavg.h"
1f5596dd
CB
46#include "utils.h"
47
1f5596dd
CB
/*
 * Data for CPU view.
 *
 * One record per cgroup. Records that hash to the same bucket are chained
 * through `next`; `cg` is the key compared with strcmp() during lookups.
 */
struct cg_proc_stat {
	char *cg;			/* cgroup path (heap allocated, owned by the node) */
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;			/* number of entries in `usage` and `view` */
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;	/* next node in the same hash bucket */
};
57
/* Head of one hash bucket of per-cgroup stat nodes. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;	/* first node in the bucket, or NULL */
	time_t lastcheck;		/* time the bucket was last pruned (or created) */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

/* Number of buckets; a node's bucket is calc_hash(cg) % CPUVIEW_HASH_SIZE. */
#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
70
b456d40d
CB
71static void reset_proc_stat_node(struct cg_proc_stat *node,
72 struct cpuacct_usage *usage, int cpu_count)
1f5596dd 73{
1f5596dd
CB
74 lxcfs_debug("Resetting stat node for %s\n", node->cg);
75 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
76
b456d40d 77 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
78 node->view[i].user = 0;
79 node->view[i].system = 0;
80 node->view[i].idle = 0;
81 }
82
83 node->cpu_count = cpu_count;
84}
85
86static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
87{
88 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
89
90 /* Allocate new memory */
82d74a95 91 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
92 if (!new_usage)
93 return false;
94
82d74a95 95 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
96 if (!new_view)
97 return false;
98
99 /* Copy existing data & initialize new elements */
100 for (int i = 0; i < cpu_count; i++) {
101 if (i < node->cpu_count) {
82d74a95
CB
102 new_usage[i].user = node->usage[i].user;
103 new_usage[i].system = node->usage[i].system;
104 new_usage[i].idle = node->usage[i].idle;
105
106 new_view[i].user = node->view[i].user;
107 new_view[i].system = node->view[i].system;
108 new_view[i].idle = node->view[i].idle;
1f5596dd
CB
109 }
110 }
111
112 free(node->usage);
113 node->usage = move_ptr(new_usage);
114
115 free(node->view);
116 node->view = move_ptr(new_view);
117 node->cpu_count = cpu_count;
118
119 return true;
120}
121
4ec5c9da
CB
122static void free_proc_stat_node(struct cg_proc_stat *node)
123{
6a4dceb1
CB
124 if (node) {
125 /*
126 * We're abusing the usage pointer to indicate that
127 * pthread_mutex_init() was successful. Don't judge me.
128 */
129 if (node->usage)
130 pthread_mutex_destroy(&node->lock);
131 free_disarm(node->cg);
132 free_disarm(node->usage);
133 free_disarm(node->view);
134 free_disarm(node);
135 }
4ec5c9da
CB
136}
137
6a4dceb1
CB
138define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
139
1f5596dd
CB
140static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
141{
0d129671
CB
142 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
143 struct cg_proc_stat *rv = new_node;
144 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
1f5596dd 145 struct cg_proc_stat_head *head = proc_stat_history[hash];
0d129671 146 struct cg_proc_stat *cur;
1f5596dd
CB
147
148 pthread_rwlock_wrlock(&head->lock);
149
150 if (!head->next) {
0d129671 151 head->next = move_ptr(new);
164acda7 152 goto out_rwlock_unlock;
1f5596dd
CB
153 }
154
0d129671 155 cur = head->next;
1f5596dd
CB
156
157 for (;;) {
0d129671
CB
158 /*
159 * The node to be added is already present in the list, so
160 * free the newly allocated one and return the one we found.
161 */
162 if (strcmp(cur->cg, new->cg) == 0) {
163 rv = cur;
164acda7 164 goto out_rwlock_unlock;
1f5596dd
CB
165 }
166
0d129671
CB
167 /* Keep walking. */
168 if (cur->next) {
169 cur = cur->next;
1f5596dd
CB
170 continue;
171 }
172
0d129671
CB
173 /* Add new node to end of list. */
174 cur->next = move_ptr(new);
164acda7 175 goto out_rwlock_unlock;
1f5596dd
CB
176 }
177
164acda7 178out_rwlock_unlock:
1f5596dd 179 pthread_rwlock_unlock(&head->lock);
0d129671 180 return move_ptr(rv);
1f5596dd
CB
181}
182
6a4dceb1
CB
183static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
184 int cpu_count, const char *cg)
1f5596dd 185{
6a4dceb1
CB
186 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
187 __do_free struct cpuacct_usage *new_usage = NULL;
1f5596dd 188
6a4dceb1 189 node = zalloc(sizeof(struct cg_proc_stat));
1f5596dd 190 if (!node)
6a4dceb1 191 return NULL;
1f5596dd 192
6a4dceb1 193 node->cg = strdup(cg);
1f5596dd 194 if (!node->cg)
6a4dceb1 195 return NULL;
1f5596dd 196
6a4dceb1
CB
197 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
198 if (!new_usage)
199 return NULL;
1f5596dd 200
6a4dceb1 201 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd 202 if (!node->view)
6a4dceb1 203 return NULL;
1f5596dd
CB
204
205 node->cpu_count = cpu_count;
1f5596dd 206
6a4dceb1
CB
207 if (pthread_mutex_init(&node->lock, NULL))
208 return NULL;
209 /*
210 * We're abusing the usage pointer to indicate that
211 * pthread_mutex_init() was successful. Don't judge me.
212 */
213 node->usage = move_ptr(new_usage);
1f5596dd 214
6a4dceb1 215 return move_ptr(node);
1f5596dd
CB
216}
217
2d00d04c
CB
218static bool cgroup_supports(const char *controller, const char *cgroup,
219 const char *file)
4ec5c9da 220{
2c990b1d
CB
221 __do_free char *path = NULL;
222 int cfd;
4ec5c9da
CB
223
224 cfd = get_cgroup_fd(controller);
225 if (cfd < 0)
226 return false;
227
925d5849 228 path = must_make_path_relative(cgroup, file, NULL);
2d00d04c 229 return faccessat(cfd, path, F_OK, 0) == 0;
4ec5c9da
CB
230}
231
1f5596dd
CB
232static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
233{
b456d40d 234 struct cg_proc_stat *first = NULL;
1f5596dd 235
b456d40d 236 for (struct cg_proc_stat *prev = NULL; node; ) {
2d00d04c 237 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
d5e34313 238 struct cg_proc_stat *cur = node;
1f5596dd
CB
239
240 if (prev)
241 prev->next = node->next;
242 else
243 first = node->next;
244
245 node = node->next;
d5e34313
CB
246 lxcfs_debug("Removing stat node for %s\n", cur);
247
248 free_proc_stat_node(cur);
1f5596dd
CB
249 } else {
250 if (!first)
251 first = node;
252 prev = node;
253 node = node->next;
254 }
255 }
256
257 return first;
258}
259
260#define PROC_STAT_PRUNE_INTERVAL 10
261static void prune_proc_stat_history(void)
262{
1f5596dd
CB
263 time_t now = time(NULL);
264
b456d40d 265 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1f5596dd
CB
266 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
267
268 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
269 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
270 return;
271 }
272
273 if (proc_stat_history[i]->next) {
274 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
275 proc_stat_history[i]->lastcheck = now;
276 }
277
278 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
279 }
280}
281
282static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
283 const char *cg)
284{
285 struct cg_proc_stat *node;
286
287 pthread_rwlock_rdlock(&head->lock);
288
289 if (!head->next) {
290 pthread_rwlock_unlock(&head->lock);
291 return NULL;
292 }
293
294 node = head->next;
295
296 do {
297 if (strcmp(cg, node->cg) == 0)
298 goto out;
299 } while ((node = node->next));
300
301 node = NULL;
302
303out:
304 pthread_rwlock_unlock(&head->lock);
305 prune_proc_stat_history();
306 return node;
307}
308
692f48eb
CB
309static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
310 int cpu_count, const char *cg)
1f5596dd
CB
311{
312 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
313 struct cg_proc_stat_head *head = proc_stat_history[hash];
314 struct cg_proc_stat *node;
315
316 node = find_proc_stat_node(head, cg);
1f5596dd
CB
317 if (!node) {
318 node = new_proc_stat_node(usage, cpu_count, cg);
319 if (!node)
320 return NULL;
321
322 node = add_proc_stat_node(node);
323 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
324 }
325
326 pthread_mutex_lock(&node->lock);
327
ce089f10
CB
328 /*
329 * If additional CPUs on the host have been enabled, CPU usage counter
330 * arrays have to be expanded.
331 */
1f5596dd
CB
332 if (node->cpu_count < cpu_count) {
333 lxcfs_debug("Expanding stat node %d->%d for %s\n",
ce089f10 334 node->cpu_count, cpu_count, cg);
1f5596dd
CB
335
336 if (!expand_proc_stat_node(node, cpu_count)) {
337 pthread_mutex_unlock(&node->lock);
b456d40d 338 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
1f5596dd
CB
339 }
340 }
341
342 return node;
343}
344
2b8eff1d
CB
345static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
346 uint64_t *counter, uint64_t threshold)
1f5596dd 347{
1ba088ae 348 uint64_t free_space, to_add;
1f5596dd
CB
349
350 free_space = threshold - usage->user - usage->system;
351
352 if (free_space > usage->idle)
353 free_space = usage->idle;
354
8206874a
CB
355 if (free_space > *surplus)
356 to_add = *surplus;
357 else
358 to_add = free_space;
1f5596dd
CB
359
360 *counter += to_add;
361 usage->idle -= to_add;
362 *surplus -= to_add;
363}
364
1ba088ae
CB
365static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
366 struct cpuacct_usage *newer,
367 struct cpuacct_usage *diff, int cpu_count)
1f5596dd 368{
1ba088ae 369 uint64_t sum = 0;
1f5596dd 370
b456d40d 371 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
372 if (!newer[i].online)
373 continue;
374
b456d40d
CB
375 /*
376 * When cpuset is changed on the fly, the CPUs might get
377 * reordered. We could either reset all counters, or check
378 * that the substractions below will return expected results.
1f5596dd
CB
379 */
380 if (newer[i].user > older[i].user)
381 diff[i].user = newer[i].user - older[i].user;
382 else
383 diff[i].user = 0;
384
385 if (newer[i].system > older[i].system)
386 diff[i].system = newer[i].system - older[i].system;
387 else
388 diff[i].system = 0;
389
390 if (newer[i].idle > older[i].idle)
391 diff[i].idle = newer[i].idle - older[i].idle;
392 else
393 diff[i].idle = 0;
394
395 sum += diff[i].user;
396 sum += diff[i].system;
397 sum += diff[i].idle;
398 }
399
400 return sum;
401}
402
403/*
b456d40d
CB
404 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
405 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
92264841 406 * through `value`.
1f5596dd
CB
407 */
408static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
409{
410 __do_free char *str = NULL;
48f6862e 411 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
9844eea7 412 bool first = true;
48f6862e 413 int ret;
1f5596dd 414
48f6862e 415 if (pure_unified_layout(cgroup_ops)) {
9844eea7 416 first = !strcmp(param, "quota");
48f6862e
CB
417 ret = snprintf(file, sizeof(file), "cpu.max");
418 } else {
419 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
9844eea7 420 }
48f6862e 421 if (ret < 0 || (size_t)ret >= sizeof(file))
1f5596dd
CB
422 return false;
423
48f6862e 424 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
1f5596dd
CB
425 return false;
426
48f6862e 427 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
1f5596dd
CB
428}
429
/*
 * Return the exact (possibly fractional) number of CPUs the CPU quota of
 * @cg allows, i.e. quota / period, capped at get_nprocs().
 * Zero is returned if the quota or period cannot be read or is not positive.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	double allowed;
	int host_cpus;

	/* The period is only read when the quota was read successfully. */
	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	allowed = (double)quota / (double)period;
	host_cpus = get_nprocs();

	return allowed > host_cpus ? host_cpus : allowed;
}
458
459/*
460 * Return the maximum number of visible CPUs based on CPU quotas.
461 * If there is no quota set, zero is returned.
462 */
4ec5c9da 463int max_cpu_count(const char *cg)
1f5596dd 464{
700dd417 465 __do_free char *cpuset = NULL;
1f5596dd
CB
466 int rv, nprocs;
467 int64_t cfs_quota, cfs_period;
468 int nr_cpus_in_cpuset = 0;
1f5596dd 469
921bdfdb
CB
470 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
471 return 0;
472
473 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
474 return 0;
1f5596dd
CB
475
476 cpuset = get_cpuset(cg);
477 if (cpuset)
478 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
479
921bdfdb 480 if (cfs_quota <= 0 || cfs_period <= 0) {
1f5596dd
CB
481 if (nr_cpus_in_cpuset > 0)
482 return nr_cpus_in_cpuset;
483
484 return 0;
485 }
486
487 rv = cfs_quota / cfs_period;
488
921bdfdb
CB
489 /*
490 * In case quota/period does not yield a whole number, add one CPU for
1f5596dd
CB
491 * the remainder.
492 */
493 if ((cfs_quota % cfs_period) > 0)
494 rv += 1;
495
496 nprocs = get_nprocs();
1f5596dd
CB
497 if (rv > nprocs)
498 rv = nprocs;
499
921bdfdb 500 /* Use min value in cpu quota and cpuset. */
1f5596dd
CB
501 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
502 rv = nr_cpus_in_cpuset;
503
504 return rv;
505}
506
507int cpuview_proc_stat(const char *cg, const char *cpuset,
508 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
509 FILE *f, char *buf, size_t buf_size)
510{
511 __do_free char *line = NULL;
512 __do_free struct cpuacct_usage *diff = NULL;
4f18a602 513 size_t linelen = 0, total_len = 0;
1f5596dd
CB
514 int curcpu = -1; /* cpu numbering starts at 0 */
515 int physcpu, i;
39f231da 516 int cpu_cnt = 0;
2b8eff1d
CB
517 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
518 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
519 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
520 uint64_t user_surplus = 0, system_surplus = 0;
39f231da 521 int nprocs, max_cpus;
4f18a602 522 ssize_t l;
2b8eff1d 523 uint64_t total_sum, threshold;
1f5596dd 524 struct cg_proc_stat *stat_node;
1f5596dd 525
39f231da 526 nprocs = get_nprocs_conf();
1f5596dd
CB
527 if (cg_cpu_usage_size < nprocs)
528 nprocs = cg_cpu_usage_size;
529
530 /* Read all CPU stats and stop when we've encountered other lines */
531 while (getline(&line, &linelen, f) != -1) {
532 int ret;
533 char cpu_char[10]; /* That's a lot of cores */
534 uint64_t all_used, cg_used;
535
536 if (strlen(line) == 0)
537 continue;
538
539 /* not a ^cpuN line containing a number N */
540 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
541 break;
542
543 if (sscanf(cpu_char, "%d", &physcpu) != 1)
544 continue;
545
546 if (physcpu >= cg_cpu_usage_size)
547 continue;
548
fd65c77c
CB
549 curcpu++;
550 cpu_cnt++;
1f5596dd
CB
551
552 if (!cpu_in_cpuset(physcpu, cpuset)) {
553 for (i = curcpu; i <= physcpu; i++)
554 cg_cpu_usage[i].online = false;
555 continue;
556 }
557
558 if (curcpu < physcpu) {
559 /* Some CPUs may be disabled */
560 for (i = curcpu; i < physcpu; i++)
561 cg_cpu_usage[i].online = false;
562
563 curcpu = physcpu;
564 }
565
566 cg_cpu_usage[curcpu].online = true;
567
2b8eff1d 568 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu",
1f5596dd
CB
569 &user,
570 &nice,
571 &system,
572 &idle,
573 &iowait,
574 &irq,
575 &softirq,
576 &steal,
577 &guest,
578 &guest_nice);
1f5596dd
CB
579 if (ret != 10)
580 continue;
581
582 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
583 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
584
585 if (all_used >= cg_used) {
586 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
587
588 } else {
2b8eff1d
CB
589 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
590 curcpu, cg, all_used, cg_used);
1f5596dd
CB
591 cg_cpu_usage[curcpu].idle = idle;
592 }
593 }
594
f9434b9a
CB
595 /* Cannot use more CPUs than is available in cpuset. */
596 max_cpus = max_cpu_count(cg);
597 if (max_cpus > cpu_cnt || !max_cpus)
598 max_cpus = cpu_cnt;
599
692f48eb 600 /* takes lock pthread_mutex_lock(&node->lock) */
1f5596dd 601 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
b456d40d
CB
602 if (!stat_node)
603 return log_error(0, "Failed to find/create stat node for %s", cg);
1f5596dd 604
b4572722 605 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
700dd417 606 if (!diff)
08d61303 607 goto out_pthread_mutex_unlock;
1f5596dd
CB
608
609 /*
610 * If the new values are LOWER than values stored in memory, it means
611 * the cgroup has been reset/recreated and we should reset too.
612 */
613 for (curcpu = 0; curcpu < nprocs; curcpu++) {
614 if (!cg_cpu_usage[curcpu].online)
615 continue;
616
617 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
618 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
619
620 break;
621 }
622
623 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
624
625 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
626 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
627
628 if (!stat_node->usage[curcpu].online)
629 continue;
630
631 i++;
632
b4572722 633 stat_node->usage[curcpu].user += diff[curcpu].user;
1f5596dd 634 stat_node->usage[curcpu].system += diff[curcpu].system;
b4572722 635 stat_node->usage[curcpu].idle += diff[curcpu].idle;
1f5596dd
CB
636
637 if (max_cpus > 0 && i >= max_cpus) {
b4572722
CB
638 user_surplus += diff[curcpu].user;
639 system_surplus += diff[curcpu].system;
1f5596dd
CB
640 }
641 }
642
643 /* Calculate usage counters of visible CPUs */
644 if (max_cpus > 0) {
2b8eff1d
CB
645 uint64_t diff_user = 0;
646 uint64_t diff_system = 0;
647 uint64_t diff_idle = 0;
648 uint64_t max_diff_idle = 0;
649 uint64_t max_diff_idle_index = 0;
1f5596dd 650 double exact_cpus;
1f5596dd
CB
651 /* threshold = maximum usage per cpu, including idle */
652 threshold = total_sum / cpu_cnt * max_cpus;
653
654 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
655 if (!stat_node->usage[curcpu].online)
656 continue;
657
658 i++;
659
660 if (i == max_cpus)
661 break;
662
663 if (diff[curcpu].user + diff[curcpu].system >= threshold)
664 continue;
665
666 /* Add user */
667 add_cpu_usage(&user_surplus, &diff[curcpu],
668 &diff[curcpu].user, threshold);
669
670 if (diff[curcpu].user + diff[curcpu].system >= threshold)
671 continue;
672
673 /* If there is still room, add system */
674 add_cpu_usage(&system_surplus, &diff[curcpu],
675 &diff[curcpu].system, threshold);
676 }
677
678 if (user_surplus > 0)
679 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
680 if (system_surplus > 0)
681 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
682
683 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
684 if (!stat_node->usage[curcpu].online)
685 continue;
686
687 i++;
688
689 if (i == max_cpus)
690 break;
691
b4572722
CB
692 stat_node->view[curcpu].user += diff[curcpu].user;
693 stat_node->view[curcpu].system += diff[curcpu].system;
694 stat_node->view[curcpu].idle += diff[curcpu].idle;
1f5596dd 695
b4572722
CB
696 user_sum += stat_node->view[curcpu].user;
697 system_sum += stat_node->view[curcpu].system;
698 idle_sum += stat_node->view[curcpu].idle;
1f5596dd 699
b4572722
CB
700 diff_user += diff[curcpu].user;
701 diff_system += diff[curcpu].system;
702 diff_idle += diff[curcpu].idle;
1f5596dd 703 if (diff[curcpu].idle > max_diff_idle) {
b4572722
CB
704 max_diff_idle = diff[curcpu].idle;
705 max_diff_idle_index = curcpu;
1f5596dd
CB
706 }
707
708 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
709 }
710 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
711
712 /* revise cpu usage view to support partial cpu case. */
713 exact_cpus = exact_cpu_count(cg);
714 if (exact_cpus < (double)max_cpus){
1ba088ae 715 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
1f5596dd
CB
716
717 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
718 lxcfs_v("delta: %lu\n", delta);
719 lxcfs_v("idle_sum before: %lu\n", idle_sum);
b4572722
CB
720 if (idle_sum > delta)
721 idle_sum = idle_sum - delta;
722 else
723 idle_sum = 0;
1f5596dd
CB
724 lxcfs_v("idle_sum after: %lu\n", idle_sum);
725
726 curcpu = max_diff_idle_index;
727 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
b4572722
CB
728 if (stat_node->view[curcpu].idle > delta)
729 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
730 else
731 stat_node->view[curcpu].idle = 0;
1f5596dd
CB
732 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
733 }
734 } else {
735 for (curcpu = 0; curcpu < nprocs; curcpu++) {
736 if (!stat_node->usage[curcpu].online)
737 continue;
738
b4572722
CB
739 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
740 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
741 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
1f5596dd 742
b4572722
CB
743 user_sum += stat_node->view[curcpu].user;
744 system_sum += stat_node->view[curcpu].system;
745 idle_sum += stat_node->view[curcpu].idle;
1f5596dd
CB
746 }
747 }
748
749 /* Render the file */
750 /* cpu-all */
2b8eff1d
CB
751 l = snprintf(buf, buf_size,
752 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
753 user_sum, system_sum, idle_sum);
1f5596dd 754 lxcfs_v("cpu-all: %s\n", buf);
692f48eb
CB
755 if (l < 0) {
756 lxcfs_error("Failed to write cache");
757 total_len = 0;
758 goto out_pthread_mutex_unlock;
759 }
3cf1e562 760 if ((size_t)l >= buf_size) {
08d61303
CB
761 lxcfs_error("Write to cache was truncated");
762 total_len = 0;
763 goto out_pthread_mutex_unlock;
764 }
1f5596dd
CB
765
766 buf += l;
767 buf_size -= l;
768 total_len += l;
769
770 /* Render visible CPUs */
771 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
772 if (!stat_node->usage[curcpu].online)
773 continue;
774
775 i++;
776
777 if (max_cpus > 0 && i == max_cpus)
778 break;
779
2b8eff1d
CB
780 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
781 i,
782 stat_node->view[curcpu].user,
783 stat_node->view[curcpu].system,
784 stat_node->view[curcpu].idle);
1f5596dd 785 lxcfs_v("cpu: %s\n", buf);
692f48eb
CB
786 if (l < 0) {
787 lxcfs_error("Failed to write cache");
788 total_len = 0;
789 goto out_pthread_mutex_unlock;
790 }
3cf1e562 791 if ((size_t)l >= buf_size) {
692f48eb
CB
792 lxcfs_error("Write to cache was truncated");
793 total_len = 0;
794 goto out_pthread_mutex_unlock;
795 }
1f5596dd
CB
796
797 buf += l;
798 buf_size -= l;
799 total_len += l;
800 }
801
802 /* Pass the rest of /proc/stat, start with the last line read */
803 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
804 if (l < 0) {
805 lxcfs_error("Failed to write cache");
806 total_len = 0;
807 goto out_pthread_mutex_unlock;
808 }
3cf1e562 809 if ((size_t)l >= buf_size) {
692f48eb
CB
810 lxcfs_error("Write to cache was truncated");
811 total_len = 0;
812 goto out_pthread_mutex_unlock;
813 }
1f5596dd
CB
814
815 buf += l;
816 buf_size -= l;
817 total_len += l;
818
819 /* Pass the rest of the host's /proc/stat */
820 while (getline(&line, &linelen, f) != -1) {
821 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
822 if (l < 0) {
823 lxcfs_error("Failed to write cache");
824 total_len = 0;
825 goto out_pthread_mutex_unlock;
826 }
3cf1e562 827 if ((size_t)l >= buf_size) {
692f48eb
CB
828 lxcfs_error("Write to cache was truncated");
829 total_len = 0;
830 goto out_pthread_mutex_unlock;
831 }
b456d40d 832
1f5596dd
CB
833 buf += l;
834 buf_size -= l;
835 total_len += l;
836 }
837
692f48eb 838out_pthread_mutex_unlock:
1f5596dd
CB
839 if (stat_node)
840 pthread_mutex_unlock(&stat_node->lock);
b456d40d 841
1f5596dd
CB
842 return total_len;
843}
844
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. starts with "processor", a colon and a decimal number (whitespace
 * between them is flexible).
 */
static inline bool is_processor_line(const char *line)
{
	int ignored;

	if (sscanf(line, "processor : %d", &ignored) != 1)
		return false;

	return true;
}
853
/*
 * Return true if @line is a "processor : N" line and CPU N is part of
 * @cpuset. Lines that do not parse are reported as not in the cpuset.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_id;

	if (sscanf(line, "processor : %d", &cpu_id) != 1)
		return false;

	return cpu_in_cpuset(cpu_id, cpuset);
}
863
/*
 * FUSE read handler for the container's view of /proc/cpuinfo.
 *
 * A read at offset 0 builds the filtered file in the per-open cache
 * (d->buf), keeping only processors that are in the caller's cpuset,
 * renumbering them from 0 and, when the CFS based view is enabled, stopping
 * after max_cpu_count() processors. Reads at a non-zero offset are served
 * from that cache.
 *
 * Returns the number of bytes copied into @buf, 0 when nothing could be
 * produced, or -EINVAL for an offset beyond the cached size.
 */
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	/* Continuation read: serve from what the offset-0 read cached. */
	if (offset) {
		size_t left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	/* Fall back to the calling pid when no usable init pid is known. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	/* The CPU limit is only applied when the CFS based view is enabled. */
	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			/* s390x uses a different layout; its header is rebuilt below. */
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		/*
		 * Only the first 12 characters ("# processors") are compared,
		 * so the line is skipped regardless of what follows the word.
		 */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			/* Stop once the allowed number of CPUs has been emitted. */
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			/* Decides whether the following detail lines are copied. */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if ((size_t)l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu ++;
			/* Keep everything after the colon, replace the CPU number. */
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if ((size_t)l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if ((size_t)l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		/*
		 * Rebuild the cache with the s390x header in front: the text
		 * collected so far moves to origcache (freed on scope exit) and
		 * a new buffer of the same size receives header + old content.
		 */
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			/* Keep the old buffer attached to d on allocation failure. */
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || (size_t)l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || (size_t)l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || (size_t)l >= cache_size)
			return 0;
		total_len += l;
	}

	/* Remember the full size so that later offset reads can be served. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	return total_len;
}
1034
1035/*
1036 * Returns 0 on success.
1037 * It is the caller's responsibility to free `return_usage`, unless this
1038 * function returns an error.
1039 */
1040int read_cpuacct_usage_all(char *cg, char *cpuset,
1041 struct cpuacct_usage **return_usage, int *size)
1042{
1043 __do_free char *usage_str = NULL;
1044 __do_free struct cpuacct_usage *cpu_usage = NULL;
9ce186dc 1045 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
8b6987a2 1046 int cpucount;
9ce186dc 1047 int ret;
1f5596dd
CB
1048 int cg_cpu;
1049 uint64_t cg_user, cg_system;
1050 int64_t ticks_per_sec;
1051
1052 ticks_per_sec = sysconf(_SC_CLK_TCK);
1f5596dd 1053 if (ticks_per_sec < 0 && errno == EINVAL) {
8b6987a2 1054 lxcfs_debug("%m - Failed to determine number of ticks per second");
1f5596dd
CB
1055 return -1;
1056 }
1057
f9434b9a 1058 cpucount = get_nprocs_conf();
1f5596dd
CB
1059 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1060 if (!cpu_usage)
1061 return -ENOMEM;
1062
1063 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1064 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
8b6987a2
CB
1065 char *sep = " \t\n";
1066 char *tok;
1f5596dd 1067
8b6987a2
CB
1068 /* Read cpuacct.usage_percpu instead. */
1069 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1f5596dd
CB
1070 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1071 return -1;
1f5596dd 1072
8b6987a2
CB
1073 lxc_iterate_parts(tok, usage_str, sep) {
1074 uint64_t percpu_user;
1075
1076 if (i >= cpucount)
1077 break;
1f5596dd 1078
8b6987a2
CB
1079 tok = trim_whitespace_in_place(tok);
1080 ret = safe_uint64(tok, &percpu_user, 10);
1081 if (ret)
1082 return -1;
1f5596dd 1083
8b6987a2
CB
1084 /* Convert the time from nanoseconds to USER_HZ */
1085 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1086 cpu_usage[i].system = cpu_usage[i].user;
1f5596dd 1087 i++;
8b6987a2 1088 lxcfs_debug("cpu%d with time %s", i, tok);
1f5596dd 1089 }
8b6987a2
CB
1090 } else {
1091 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1092 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1f5596dd 1093
8b6987a2 1094 read_pos += read_cnt;
1f5596dd 1095
8b6987a2
CB
1096 for (i = 0, j = 0; i < cpucount; i++) {
1097 ret = sscanf(usage_str + read_pos,
1098 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1099 &cg_user, &cg_system, &read_cnt);
1f5596dd 1100
8b6987a2
CB
1101 if (ret == EOF)
1102 break;
1f5596dd 1103
8b6987a2
CB
1104 if (ret != 3)
1105 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1106 usage_str + read_pos, cg);
1f5596dd 1107
8b6987a2 1108 read_pos += read_cnt;
1f5596dd 1109
8b6987a2
CB
1110 /* Convert the time from nanoseconds to USER_HZ */
1111 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1112 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1113 j++;
1114 }
1f5596dd
CB
1115 }
1116
1117 *return_usage = move_ptr(cpu_usage);
1118 *size = cpucount;
1119 return 0;
1120}
1121
1122static bool cpuview_init_head(struct cg_proc_stat_head **head)
1123{
9d7fc1a3 1124 __do_free struct cg_proc_stat_head *h;
1f5596dd 1125
9d7fc1a3
CB
1126 h = zalloc(sizeof(struct cg_proc_stat_head));
1127 if (!h)
1128 return false;
1f5596dd 1129
9d7fc1a3
CB
1130 if (pthread_rwlock_init(&h->lock, NULL))
1131 return false;
1132
1133 h->lastcheck = time(NULL);
1f5596dd 1134
9d7fc1a3 1135 *head = move_ptr(h);
1f5596dd
CB
1136 return true;
1137}
1138
4ec5c9da 1139bool init_cpuview(void)
1f5596dd
CB
1140{
1141 int i;
1142
1143 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1144 proc_stat_history[i] = NULL;
1145
1146 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1147 if (!cpuview_init_head(&proc_stat_history[i]))
1148 goto err;
1149 }
1150
1151 return true;
1152
1153err:
1154 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1155 if (proc_stat_history[i])
1156 free_disarm(proc_stat_history[i]);
1157 }
1158
1159 return false;
1160}
1161
1f5596dd
CB
1162static void cpuview_free_head(struct cg_proc_stat_head *head)
1163{
905769cd 1164 struct cg_proc_stat *node;
1f5596dd
CB
1165
1166 if (head->next) {
1167 node = head->next;
1168
1169 for (;;) {
905769cd 1170 struct cg_proc_stat *cur = node;
1f5596dd 1171 node = node->next;
905769cd 1172 free_proc_stat_node(cur);
1f5596dd
CB
1173 if (!node)
1174 break;
1175 }
1176 }
1177
1178 pthread_rwlock_destroy(&head->lock);
1179 free_disarm(head);
1180}
1181
4ec5c9da 1182void free_cpuview(void)
1f5596dd 1183{
4ec5c9da 1184 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1f5596dd
CB
1185 if (proc_stat_history[i])
1186 cpuview_free_head(proc_stat_history[i]);
1f5596dd 1187}