]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/proc_cpuview.c
src: rely on config.h for fuse version
[mirror_lxcfs.git] / src / proc_cpuview.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
1f5596dd
CB
2
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE
5#endif
6
f834b6bf
SP
7#include "config.h"
8
1f5596dd
CB
9#define __STDC_FORMAT_MACROS
10#include <dirent.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <fuse.h>
14#include <inttypes.h>
15#include <libgen.h>
16#include <pthread.h>
17#include <sched.h>
18#include <stdarg.h>
19#include <stdbool.h>
20#include <stdint.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <time.h>
25#include <unistd.h>
26#include <wait.h>
27#include <linux/magic.h>
28#include <linux/sched.h>
29#include <sys/epoll.h>
30#include <sys/mman.h>
31#include <sys/mount.h>
32#include <sys/param.h>
33#include <sys/socket.h>
34#include <sys/syscall.h>
35#include <sys/sysinfo.h>
36#include <sys/vfs.h>
37
38#include "bindings.h"
1f5596dd
CB
39#include "cgroup_fuse.h"
40#include "cpuset_parse.h"
41#include "cgroups/cgroup.h"
42#include "cgroups/cgroup_utils.h"
43#include "memory_utils.h"
4ec5c9da 44#include "proc_loadavg.h"
1f5596dd
CB
45#include "utils.h"
46
/*
 * Data for CPU view: per-cgroup CPU accounting state. Nodes are chained
 * into the buckets of proc_stat_history via `next`.
 */
struct cg_proc_stat {
	char *cg;			/* Cgroup name; compared with strcmp() on lookup. */
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;			/* Number of per-CPU entries tracked in usage and view. */
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;	/* Next node in the same bucket, or NULL. */
};
56
57struct cg_proc_stat_head {
58 struct cg_proc_stat *next;
59 time_t lastcheck;
60
61 /*
62 * For access to the list. Reading can be parallel, pruning is exclusive.
63 */
64 pthread_rwlock_t lock;
65};
66
/* Number of buckets in the hash table of per-cgroup stat nodes. */
#define CPUVIEW_HASH_SIZE 100
/* Bucket heads, indexed by calc_hash(cgroup) % CPUVIEW_HASH_SIZE. */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
69
b456d40d
CB
70static void reset_proc_stat_node(struct cg_proc_stat *node,
71 struct cpuacct_usage *usage, int cpu_count)
1f5596dd 72{
1f5596dd
CB
73 lxcfs_debug("Resetting stat node for %s\n", node->cg);
74 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
75
b456d40d 76 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
77 node->view[i].user = 0;
78 node->view[i].system = 0;
79 node->view[i].idle = 0;
80 }
81
82 node->cpu_count = cpu_count;
83}
84
85static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
86{
87 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
88
89 /* Allocate new memory */
82d74a95 90 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
91 if (!new_usage)
92 return false;
93
82d74a95 94 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
95 if (!new_view)
96 return false;
97
98 /* Copy existing data & initialize new elements */
99 for (int i = 0; i < cpu_count; i++) {
100 if (i < node->cpu_count) {
82d74a95
CB
101 new_usage[i].user = node->usage[i].user;
102 new_usage[i].system = node->usage[i].system;
103 new_usage[i].idle = node->usage[i].idle;
104
105 new_view[i].user = node->view[i].user;
106 new_view[i].system = node->view[i].system;
107 new_view[i].idle = node->view[i].idle;
1f5596dd
CB
108 }
109 }
110
111 free(node->usage);
112 node->usage = move_ptr(new_usage);
113
114 free(node->view);
115 node->view = move_ptr(new_view);
116 node->cpu_count = cpu_count;
117
118 return true;
119}
120
4ec5c9da
CB
121static void free_proc_stat_node(struct cg_proc_stat *node)
122{
6a4dceb1
CB
123 if (node) {
124 /*
125 * We're abusing the usage pointer to indicate that
126 * pthread_mutex_init() was successful. Don't judge me.
127 */
128 if (node->usage)
129 pthread_mutex_destroy(&node->lock);
130 free_disarm(node->cg);
131 free_disarm(node->usage);
132 free_disarm(node->view);
133 free_disarm(node);
134 }
4ec5c9da
CB
135}
136
6a4dceb1
CB
137define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
138
1f5596dd
CB
139static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
140{
0d129671
CB
141 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
142 struct cg_proc_stat *rv = new_node;
143 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
1f5596dd 144 struct cg_proc_stat_head *head = proc_stat_history[hash];
0d129671 145 struct cg_proc_stat *cur;
1f5596dd
CB
146
147 pthread_rwlock_wrlock(&head->lock);
148
149 if (!head->next) {
0d129671 150 head->next = move_ptr(new);
164acda7 151 goto out_rwlock_unlock;
1f5596dd
CB
152 }
153
0d129671 154 cur = head->next;
1f5596dd
CB
155
156 for (;;) {
0d129671
CB
157 /*
158 * The node to be added is already present in the list, so
159 * free the newly allocated one and return the one we found.
160 */
161 if (strcmp(cur->cg, new->cg) == 0) {
162 rv = cur;
164acda7 163 goto out_rwlock_unlock;
1f5596dd
CB
164 }
165
0d129671
CB
166 /* Keep walking. */
167 if (cur->next) {
168 cur = cur->next;
1f5596dd
CB
169 continue;
170 }
171
0d129671
CB
172 /* Add new node to end of list. */
173 cur->next = move_ptr(new);
164acda7 174 goto out_rwlock_unlock;
1f5596dd
CB
175 }
176
164acda7 177out_rwlock_unlock:
1f5596dd 178 pthread_rwlock_unlock(&head->lock);
0d129671 179 return move_ptr(rv);
1f5596dd
CB
180}
181
6a4dceb1
CB
182static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
183 int cpu_count, const char *cg)
1f5596dd 184{
6a4dceb1
CB
185 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
186 __do_free struct cpuacct_usage *new_usage = NULL;
1f5596dd 187
6a4dceb1 188 node = zalloc(sizeof(struct cg_proc_stat));
1f5596dd 189 if (!node)
6a4dceb1 190 return NULL;
1f5596dd 191
6a4dceb1 192 node->cg = strdup(cg);
1f5596dd 193 if (!node->cg)
6a4dceb1 194 return NULL;
1f5596dd 195
6a4dceb1
CB
196 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
197 if (!new_usage)
198 return NULL;
1f5596dd 199
6a4dceb1 200 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd 201 if (!node->view)
6a4dceb1 202 return NULL;
1f5596dd
CB
203
204 node->cpu_count = cpu_count;
1f5596dd 205
6a4dceb1
CB
206 if (pthread_mutex_init(&node->lock, NULL))
207 return NULL;
208 /*
209 * We're abusing the usage pointer to indicate that
210 * pthread_mutex_init() was successful. Don't judge me.
211 */
212 node->usage = move_ptr(new_usage);
1f5596dd 213
6a4dceb1 214 return move_ptr(node);
1f5596dd
CB
215}
216
2d00d04c
CB
217static bool cgroup_supports(const char *controller, const char *cgroup,
218 const char *file)
4ec5c9da 219{
2c990b1d
CB
220 __do_free char *path = NULL;
221 int cfd;
4ec5c9da
CB
222
223 cfd = get_cgroup_fd(controller);
224 if (cfd < 0)
225 return false;
226
925d5849 227 path = must_make_path_relative(cgroup, file, NULL);
2d00d04c 228 return faccessat(cfd, path, F_OK, 0) == 0;
4ec5c9da
CB
229}
230
1f5596dd
CB
231static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
232{
b456d40d 233 struct cg_proc_stat *first = NULL;
1f5596dd 234
b456d40d 235 for (struct cg_proc_stat *prev = NULL; node; ) {
2d00d04c
CB
236 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
237 call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;
1f5596dd
CB
238
239 if (prev)
240 prev->next = node->next;
241 else
242 first = node->next;
243
244 node = node->next;
2d00d04c 245 lxcfs_debug("Removing stat node for %s\n", cur->cg);
1f5596dd
CB
246 } else {
247 if (!first)
248 first = node;
249 prev = node;
250 node = node->next;
251 }
252 }
253
254 return first;
255}
256
257#define PROC_STAT_PRUNE_INTERVAL 10
258static void prune_proc_stat_history(void)
259{
1f5596dd
CB
260 time_t now = time(NULL);
261
b456d40d 262 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1f5596dd
CB
263 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
264
265 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
266 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
267 return;
268 }
269
270 if (proc_stat_history[i]->next) {
271 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
272 proc_stat_history[i]->lastcheck = now;
273 }
274
275 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
276 }
277}
278
279static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
280 const char *cg)
281{
282 struct cg_proc_stat *node;
283
284 pthread_rwlock_rdlock(&head->lock);
285
286 if (!head->next) {
287 pthread_rwlock_unlock(&head->lock);
288 return NULL;
289 }
290
291 node = head->next;
292
293 do {
294 if (strcmp(cg, node->cg) == 0)
295 goto out;
296 } while ((node = node->next));
297
298 node = NULL;
299
300out:
301 pthread_rwlock_unlock(&head->lock);
302 prune_proc_stat_history();
303 return node;
304}
305
692f48eb
CB
306static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
307 int cpu_count, const char *cg)
1f5596dd
CB
308{
309 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
310 struct cg_proc_stat_head *head = proc_stat_history[hash];
311 struct cg_proc_stat *node;
312
313 node = find_proc_stat_node(head, cg);
1f5596dd
CB
314 if (!node) {
315 node = new_proc_stat_node(usage, cpu_count, cg);
316 if (!node)
317 return NULL;
318
319 node = add_proc_stat_node(node);
320 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
321 }
322
323 pthread_mutex_lock(&node->lock);
324
ce089f10
CB
325 /*
326 * If additional CPUs on the host have been enabled, CPU usage counter
327 * arrays have to be expanded.
328 */
1f5596dd
CB
329 if (node->cpu_count < cpu_count) {
330 lxcfs_debug("Expanding stat node %d->%d for %s\n",
ce089f10 331 node->cpu_count, cpu_count, cg);
1f5596dd
CB
332
333 if (!expand_proc_stat_node(node, cpu_count)) {
334 pthread_mutex_unlock(&node->lock);
b456d40d 335 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
1f5596dd
CB
336 }
337 }
338
339 return node;
340}
341
2b8eff1d
CB
342static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
343 uint64_t *counter, uint64_t threshold)
1f5596dd 344{
1ba088ae 345 uint64_t free_space, to_add;
1f5596dd
CB
346
347 free_space = threshold - usage->user - usage->system;
348
349 if (free_space > usage->idle)
350 free_space = usage->idle;
351
8206874a
CB
352 if (free_space > *surplus)
353 to_add = *surplus;
354 else
355 to_add = free_space;
1f5596dd
CB
356
357 *counter += to_add;
358 usage->idle -= to_add;
359 *surplus -= to_add;
360}
361
1ba088ae
CB
362static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
363 struct cpuacct_usage *newer,
364 struct cpuacct_usage *diff, int cpu_count)
1f5596dd 365{
1ba088ae 366 uint64_t sum = 0;
1f5596dd 367
b456d40d 368 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
369 if (!newer[i].online)
370 continue;
371
b456d40d
CB
372 /*
373 * When cpuset is changed on the fly, the CPUs might get
374 * reordered. We could either reset all counters, or check
375 * that the substractions below will return expected results.
1f5596dd
CB
376 */
377 if (newer[i].user > older[i].user)
378 diff[i].user = newer[i].user - older[i].user;
379 else
380 diff[i].user = 0;
381
382 if (newer[i].system > older[i].system)
383 diff[i].system = newer[i].system - older[i].system;
384 else
385 diff[i].system = 0;
386
387 if (newer[i].idle > older[i].idle)
388 diff[i].idle = newer[i].idle - older[i].idle;
389 else
390 diff[i].idle = 0;
391
392 sum += diff[i].user;
393 sum += diff[i].system;
394 sum += diff[i].idle;
395 }
396
397 return sum;
398}
399
400/*
b456d40d
CB
401 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
402 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
92264841 403 * through `value`.
1f5596dd
CB
404 */
405static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
406{
407 __do_free char *str = NULL;
48f6862e 408 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
9844eea7 409 bool first = true;
48f6862e 410 int ret;
1f5596dd 411
48f6862e 412 if (pure_unified_layout(cgroup_ops)) {
9844eea7 413 first = !strcmp(param, "quota");
48f6862e
CB
414 ret = snprintf(file, sizeof(file), "cpu.max");
415 } else {
416 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
9844eea7 417 }
48f6862e 418 if (ret < 0 || (size_t)ret >= sizeof(file))
1f5596dd
CB
419 return false;
420
48f6862e 421 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
1f5596dd
CB
422 return false;
423
48f6862e 424 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
1f5596dd
CB
425}
426
/*
 * Return the exact (possibly fractional) number of visible CPUs based on
 * CPU quotas, i.e. quota / period capped at get_nprocs().
 * If there is no quota set, or it cannot be read, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}
455
456/*
457 * Return the maximum number of visible CPUs based on CPU quotas.
458 * If there is no quota set, zero is returned.
459 */
4ec5c9da 460int max_cpu_count(const char *cg)
1f5596dd 461{
700dd417 462 __do_free char *cpuset = NULL;
1f5596dd
CB
463 int rv, nprocs;
464 int64_t cfs_quota, cfs_period;
465 int nr_cpus_in_cpuset = 0;
1f5596dd 466
921bdfdb
CB
467 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
468 return 0;
469
470 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
471 return 0;
1f5596dd
CB
472
473 cpuset = get_cpuset(cg);
474 if (cpuset)
475 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
476
921bdfdb 477 if (cfs_quota <= 0 || cfs_period <= 0) {
1f5596dd
CB
478 if (nr_cpus_in_cpuset > 0)
479 return nr_cpus_in_cpuset;
480
481 return 0;
482 }
483
484 rv = cfs_quota / cfs_period;
485
921bdfdb
CB
486 /*
487 * In case quota/period does not yield a whole number, add one CPU for
1f5596dd
CB
488 * the remainder.
489 */
490 if ((cfs_quota % cfs_period) > 0)
491 rv += 1;
492
493 nprocs = get_nprocs();
1f5596dd
CB
494 if (rv > nprocs)
495 rv = nprocs;
496
921bdfdb 497 /* Use min value in cpu quota and cpuset. */
1f5596dd
CB
498 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
499 rv = nr_cpus_in_cpuset;
500
501 return rv;
502}
503
504int cpuview_proc_stat(const char *cg, const char *cpuset,
505 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
506 FILE *f, char *buf, size_t buf_size)
507{
508 __do_free char *line = NULL;
509 __do_free struct cpuacct_usage *diff = NULL;
4f18a602 510 size_t linelen = 0, total_len = 0;
1f5596dd
CB
511 int curcpu = -1; /* cpu numbering starts at 0 */
512 int physcpu, i;
39f231da 513 int cpu_cnt = 0;
2b8eff1d
CB
514 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
515 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
516 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
517 uint64_t user_surplus = 0, system_surplus = 0;
39f231da 518 int nprocs, max_cpus;
4f18a602 519 ssize_t l;
2b8eff1d 520 uint64_t total_sum, threshold;
1f5596dd 521 struct cg_proc_stat *stat_node;
1f5596dd 522
39f231da 523 nprocs = get_nprocs_conf();
1f5596dd
CB
524 if (cg_cpu_usage_size < nprocs)
525 nprocs = cg_cpu_usage_size;
526
527 /* Read all CPU stats and stop when we've encountered other lines */
528 while (getline(&line, &linelen, f) != -1) {
529 int ret;
530 char cpu_char[10]; /* That's a lot of cores */
531 uint64_t all_used, cg_used;
532
533 if (strlen(line) == 0)
534 continue;
535
536 /* not a ^cpuN line containing a number N */
537 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
538 break;
539
540 if (sscanf(cpu_char, "%d", &physcpu) != 1)
541 continue;
542
543 if (physcpu >= cg_cpu_usage_size)
544 continue;
545
fd65c77c
CB
546 curcpu++;
547 cpu_cnt++;
1f5596dd
CB
548
549 if (!cpu_in_cpuset(physcpu, cpuset)) {
550 for (i = curcpu; i <= physcpu; i++)
551 cg_cpu_usage[i].online = false;
552 continue;
553 }
554
555 if (curcpu < physcpu) {
556 /* Some CPUs may be disabled */
557 for (i = curcpu; i < physcpu; i++)
558 cg_cpu_usage[i].online = false;
559
560 curcpu = physcpu;
561 }
562
563 cg_cpu_usage[curcpu].online = true;
564
2b8eff1d 565 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu",
1f5596dd
CB
566 &user,
567 &nice,
568 &system,
569 &idle,
570 &iowait,
571 &irq,
572 &softirq,
573 &steal,
574 &guest,
575 &guest_nice);
1f5596dd
CB
576 if (ret != 10)
577 continue;
578
579 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
580 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
581
582 if (all_used >= cg_used) {
583 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
584
585 } else {
2b8eff1d
CB
586 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
587 curcpu, cg, all_used, cg_used);
1f5596dd
CB
588 cg_cpu_usage[curcpu].idle = idle;
589 }
590 }
591
f9434b9a
CB
592 /* Cannot use more CPUs than is available in cpuset. */
593 max_cpus = max_cpu_count(cg);
594 if (max_cpus > cpu_cnt || !max_cpus)
595 max_cpus = cpu_cnt;
596
692f48eb 597 /* takes lock pthread_mutex_lock(&node->lock) */
1f5596dd 598 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
b456d40d
CB
599 if (!stat_node)
600 return log_error(0, "Failed to find/create stat node for %s", cg);
1f5596dd 601
b4572722 602 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
700dd417 603 if (!diff)
08d61303 604 goto out_pthread_mutex_unlock;
1f5596dd
CB
605
606 /*
607 * If the new values are LOWER than values stored in memory, it means
608 * the cgroup has been reset/recreated and we should reset too.
609 */
610 for (curcpu = 0; curcpu < nprocs; curcpu++) {
611 if (!cg_cpu_usage[curcpu].online)
612 continue;
613
614 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
615 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
616
617 break;
618 }
619
620 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
621
622 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
623 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
624
625 if (!stat_node->usage[curcpu].online)
626 continue;
627
628 i++;
629
b4572722 630 stat_node->usage[curcpu].user += diff[curcpu].user;
1f5596dd 631 stat_node->usage[curcpu].system += diff[curcpu].system;
b4572722 632 stat_node->usage[curcpu].idle += diff[curcpu].idle;
1f5596dd
CB
633
634 if (max_cpus > 0 && i >= max_cpus) {
b4572722
CB
635 user_surplus += diff[curcpu].user;
636 system_surplus += diff[curcpu].system;
1f5596dd
CB
637 }
638 }
639
640 /* Calculate usage counters of visible CPUs */
641 if (max_cpus > 0) {
2b8eff1d
CB
642 uint64_t diff_user = 0;
643 uint64_t diff_system = 0;
644 uint64_t diff_idle = 0;
645 uint64_t max_diff_idle = 0;
646 uint64_t max_diff_idle_index = 0;
1f5596dd 647 double exact_cpus;
1f5596dd
CB
648 /* threshold = maximum usage per cpu, including idle */
649 threshold = total_sum / cpu_cnt * max_cpus;
650
651 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
652 if (!stat_node->usage[curcpu].online)
653 continue;
654
655 i++;
656
657 if (i == max_cpus)
658 break;
659
660 if (diff[curcpu].user + diff[curcpu].system >= threshold)
661 continue;
662
663 /* Add user */
664 add_cpu_usage(&user_surplus, &diff[curcpu],
665 &diff[curcpu].user, threshold);
666
667 if (diff[curcpu].user + diff[curcpu].system >= threshold)
668 continue;
669
670 /* If there is still room, add system */
671 add_cpu_usage(&system_surplus, &diff[curcpu],
672 &diff[curcpu].system, threshold);
673 }
674
675 if (user_surplus > 0)
676 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
677 if (system_surplus > 0)
678 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
679
680 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
681 if (!stat_node->usage[curcpu].online)
682 continue;
683
684 i++;
685
686 if (i == max_cpus)
687 break;
688
b4572722
CB
689 stat_node->view[curcpu].user += diff[curcpu].user;
690 stat_node->view[curcpu].system += diff[curcpu].system;
691 stat_node->view[curcpu].idle += diff[curcpu].idle;
1f5596dd 692
b4572722
CB
693 user_sum += stat_node->view[curcpu].user;
694 system_sum += stat_node->view[curcpu].system;
695 idle_sum += stat_node->view[curcpu].idle;
1f5596dd 696
b4572722
CB
697 diff_user += diff[curcpu].user;
698 diff_system += diff[curcpu].system;
699 diff_idle += diff[curcpu].idle;
1f5596dd 700 if (diff[curcpu].idle > max_diff_idle) {
b4572722
CB
701 max_diff_idle = diff[curcpu].idle;
702 max_diff_idle_index = curcpu;
1f5596dd
CB
703 }
704
705 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
706 }
707 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
708
709 /* revise cpu usage view to support partial cpu case. */
710 exact_cpus = exact_cpu_count(cg);
711 if (exact_cpus < (double)max_cpus){
1ba088ae 712 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
1f5596dd
CB
713
714 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
715 lxcfs_v("delta: %lu\n", delta);
716 lxcfs_v("idle_sum before: %lu\n", idle_sum);
b4572722
CB
717 if (idle_sum > delta)
718 idle_sum = idle_sum - delta;
719 else
720 idle_sum = 0;
1f5596dd
CB
721 lxcfs_v("idle_sum after: %lu\n", idle_sum);
722
723 curcpu = max_diff_idle_index;
724 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
b4572722
CB
725 if (stat_node->view[curcpu].idle > delta)
726 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
727 else
728 stat_node->view[curcpu].idle = 0;
1f5596dd
CB
729 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
730 }
731 } else {
732 for (curcpu = 0; curcpu < nprocs; curcpu++) {
733 if (!stat_node->usage[curcpu].online)
734 continue;
735
b4572722
CB
736 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
737 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
738 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
1f5596dd 739
b4572722
CB
740 user_sum += stat_node->view[curcpu].user;
741 system_sum += stat_node->view[curcpu].system;
742 idle_sum += stat_node->view[curcpu].idle;
1f5596dd
CB
743 }
744 }
745
746 /* Render the file */
747 /* cpu-all */
2b8eff1d
CB
748 l = snprintf(buf, buf_size,
749 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
750 user_sum, system_sum, idle_sum);
1f5596dd 751 lxcfs_v("cpu-all: %s\n", buf);
692f48eb
CB
752 if (l < 0) {
753 lxcfs_error("Failed to write cache");
754 total_len = 0;
755 goto out_pthread_mutex_unlock;
756 }
08d61303
CB
757 if (l >= buf_size) {
758 lxcfs_error("Write to cache was truncated");
759 total_len = 0;
760 goto out_pthread_mutex_unlock;
761 }
1f5596dd
CB
762
763 buf += l;
764 buf_size -= l;
765 total_len += l;
766
767 /* Render visible CPUs */
768 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
769 if (!stat_node->usage[curcpu].online)
770 continue;
771
772 i++;
773
774 if (max_cpus > 0 && i == max_cpus)
775 break;
776
2b8eff1d
CB
777 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
778 i,
779 stat_node->view[curcpu].user,
780 stat_node->view[curcpu].system,
781 stat_node->view[curcpu].idle);
1f5596dd 782 lxcfs_v("cpu: %s\n", buf);
692f48eb
CB
783 if (l < 0) {
784 lxcfs_error("Failed to write cache");
785 total_len = 0;
786 goto out_pthread_mutex_unlock;
787 }
788 if (l >= buf_size) {
789 lxcfs_error("Write to cache was truncated");
790 total_len = 0;
791 goto out_pthread_mutex_unlock;
792 }
1f5596dd
CB
793
794 buf += l;
795 buf_size -= l;
796 total_len += l;
797 }
798
799 /* Pass the rest of /proc/stat, start with the last line read */
800 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
801 if (l < 0) {
802 lxcfs_error("Failed to write cache");
803 total_len = 0;
804 goto out_pthread_mutex_unlock;
805 }
806 if (l >= buf_size) {
807 lxcfs_error("Write to cache was truncated");
808 total_len = 0;
809 goto out_pthread_mutex_unlock;
810 }
1f5596dd
CB
811
812 buf += l;
813 buf_size -= l;
814 total_len += l;
815
816 /* Pass the rest of the host's /proc/stat */
817 while (getline(&line, &linelen, f) != -1) {
818 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
819 if (l < 0) {
820 lxcfs_error("Failed to write cache");
821 total_len = 0;
822 goto out_pthread_mutex_unlock;
823 }
824 if (l >= buf_size) {
825 lxcfs_error("Write to cache was truncated");
826 total_len = 0;
827 goto out_pthread_mutex_unlock;
828 }
b456d40d 829
1f5596dd
CB
830 buf += l;
831 buf_size -= l;
832 total_len += l;
833 }
834
692f48eb 835out_pthread_mutex_unlock:
1f5596dd
CB
836 if (stat_node)
837 pthread_mutex_unlock(&stat_node->lock);
b456d40d 838
1f5596dd
CB
839 return total_len;
840}
841
/*
 * Check whether this is a "^processor" line in /proc/cpuinfo, i.e. the line
 * starts with "processor", a colon and a decimal CPU number (whitespace
 * between the parts is flexible, as usual for sscanf).
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
850
/*
 * Return true if `line` is a "processor : N" line from /proc/cpuinfo and
 * CPU N is part of `cpuset`. Any other line yields false.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) != 1)
		return false;

	return cpu_in_cpuset(cpu, cpuset);
}
860
861int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
862 struct fuse_file_info *fi)
863{
864 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
757a63e7 865 __do_free void *fopen_cache = NULL;
1f5596dd
CB
866 __do_fclose FILE *f = NULL;
867 struct fuse_context *fc = fuse_get_context();
0274438c 868 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
99b183fb 869 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
870 size_t linelen = 0, total_len = 0;
871 bool am_printing = false, firstline = true, is_s390x = false;
872 int curcpu = -1, cpu, max_cpus = 0;
873 bool use_view;
874 char *cache = d->buf;
875 size_t cache_size = d->buflen;
876
f9434b9a 877 if (offset) {
1f5596dd
CB
878 int left;
879
880 if (offset > d->size)
881 return -EINVAL;
882
883 if (!d->cached)
884 return 0;
885
886 left = d->size - offset;
887 total_len = left > size ? size: left;
888 memcpy(buf, cache + offset, total_len);
889
890 return total_len;
891 }
892
893 pid_t initpid = lookup_initpid_in_store(fc->pid);
894 if (initpid <= 1 || is_shared_pidns(initpid))
895 initpid = fc->pid;
b456d40d 896
1f5596dd
CB
897 cg = get_pid_cgroup(initpid, "cpuset");
898 if (!cg)
899 return read_file_fuse("proc/cpuinfo", buf, size, d);
900 prune_init_slice(cg);
901
902 cpuset = get_cpuset(cg);
903 if (!cpuset)
904 return 0;
905
8044f626 906 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
0274438c 907 use_view = true;
8044f626
CB
908 else
909 use_view = false;
1f5596dd
CB
910 if (use_view)
911 max_cpus = max_cpu_count(cg);
912
757a63e7 913 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
1f5596dd
CB
914 if (!f)
915 return 0;
916
917 while (getline(&line, &linelen, f) != -1) {
918 ssize_t l;
919 if (firstline) {
920 firstline = false;
921 if (strstr(line, "IBM/S390") != NULL) {
922 is_s390x = true;
923 am_printing = true;
924 continue;
925 }
926 }
b456d40d 927
1f5596dd
CB
928 if (strncmp(line, "# processors:", 12) == 0)
929 continue;
b456d40d 930
1f5596dd 931 if (is_processor_line(line)) {
d0031abf 932 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
1f5596dd 933 break;
b456d40d 934
1f5596dd
CB
935 am_printing = cpuline_in_cpuset(line, cpuset);
936 if (am_printing) {
d0031abf 937 curcpu++;
1f5596dd 938 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
b456d40d
CB
939 if (l < 0)
940 return log_error(0, "Failed to write cache");
941 if (l >= cache_size)
942 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
943 cache += l;
944 cache_size -= l;
945 total_len += l;
946 }
947 continue;
948 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
949 char *p;
b456d40d 950
d0031abf 951 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
1f5596dd 952 break;
b456d40d 953
1f5596dd
CB
954 if (!cpu_in_cpuset(cpu, cpuset))
955 continue;
b456d40d 956
1f5596dd
CB
957 curcpu ++;
958 p = strchr(line, ':');
959 if (!p || !*p)
960 return 0;
961 p++;
b456d40d 962
1f5596dd 963 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
b456d40d
CB
964 if (l < 0)
965 return log_error(0, "Failed to write cache");
966 if (l >= cache_size)
967 return log_error(0, "Write to cache was truncated");
968
1f5596dd
CB
969 cache += l;
970 cache_size -= l;
971 total_len += l;
972 continue;
973
974 }
975 if (am_printing) {
976 l = snprintf(cache, cache_size, "%s", line);
b456d40d
CB
977 if (l < 0)
978 return log_error(0, "Failed to write cache");
979 if (l >= cache_size)
980 return log_error(0, "Write to cache was truncated");
981
1f5596dd
CB
982 cache += l;
983 cache_size -= l;
984 total_len += l;
985 }
986 }
987
988 if (is_s390x) {
989 __do_free char *origcache = d->buf;
990 ssize_t l;
991
992 d->buf = malloc(d->buflen);
993 if (!d->buf) {
994 d->buf = move_ptr(origcache);
995 return 0;
996 }
997
998 cache = d->buf;
999 cache_size = d->buflen;
1000 total_len = 0;
1001 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
1002 if (l < 0 || l >= cache_size)
1003 return 0;
1004
1005 cache_size -= l;
1006 cache += l;
1007 total_len += l;
1008 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
1009 if (l < 0 || l >= cache_size)
1010 return 0;
1011
1012 cache_size -= l;
1013 cache += l;
1014 total_len += l;
1015 l = snprintf(cache, cache_size, "%s", origcache);
1016 if (l < 0 || l >= cache_size)
1017 return 0;
1018 total_len += l;
1019 }
1020
1021 d->cached = 1;
1022 d->size = total_len;
d0031abf
CB
1023 if (total_len > size)
1024 total_len = size;
1f5596dd
CB
1025
1026 /* read from off 0 */
1027 memcpy(buf, d->buf, total_len);
d0031abf 1028
1f5596dd
CB
1029 return total_len;
1030}
1031
1032/*
1033 * Returns 0 on success.
1034 * It is the caller's responsibility to free `return_usage`, unless this
1035 * function returns an error.
1036 */
1037int read_cpuacct_usage_all(char *cg, char *cpuset,
1038 struct cpuacct_usage **return_usage, int *size)
1039{
1040 __do_free char *usage_str = NULL;
1041 __do_free struct cpuacct_usage *cpu_usage = NULL;
9ce186dc 1042 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
8b6987a2 1043 int cpucount;
9ce186dc 1044 int ret;
1f5596dd
CB
1045 int cg_cpu;
1046 uint64_t cg_user, cg_system;
1047 int64_t ticks_per_sec;
1048
1049 ticks_per_sec = sysconf(_SC_CLK_TCK);
1f5596dd 1050 if (ticks_per_sec < 0 && errno == EINVAL) {
8b6987a2 1051 lxcfs_debug("%m - Failed to determine number of ticks per second");
1f5596dd
CB
1052 return -1;
1053 }
1054
f9434b9a 1055 cpucount = get_nprocs_conf();
1f5596dd
CB
1056 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1057 if (!cpu_usage)
1058 return -ENOMEM;
1059
1060 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1061 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
8b6987a2
CB
1062 char *sep = " \t\n";
1063 char *tok;
1f5596dd 1064
8b6987a2
CB
1065 /* Read cpuacct.usage_percpu instead. */
1066 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1f5596dd
CB
1067 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1068 return -1;
1f5596dd 1069
8b6987a2
CB
1070 lxc_iterate_parts(tok, usage_str, sep) {
1071 uint64_t percpu_user;
1072
1073 if (i >= cpucount)
1074 break;
1f5596dd 1075
8b6987a2
CB
1076 tok = trim_whitespace_in_place(tok);
1077 ret = safe_uint64(tok, &percpu_user, 10);
1078 if (ret)
1079 return -1;
1f5596dd 1080
8b6987a2
CB
1081 /* Convert the time from nanoseconds to USER_HZ */
1082 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1083 cpu_usage[i].system = cpu_usage[i].user;
1f5596dd 1084 i++;
8b6987a2 1085 lxcfs_debug("cpu%d with time %s", i, tok);
1f5596dd 1086 }
8b6987a2
CB
1087 } else {
1088 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1089 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1f5596dd 1090
8b6987a2 1091 read_pos += read_cnt;
1f5596dd 1092
8b6987a2
CB
1093 for (i = 0, j = 0; i < cpucount; i++) {
1094 ret = sscanf(usage_str + read_pos,
1095 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1096 &cg_user, &cg_system, &read_cnt);
1f5596dd 1097
8b6987a2
CB
1098 if (ret == EOF)
1099 break;
1f5596dd 1100
8b6987a2
CB
1101 if (ret != 3)
1102 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1103 usage_str + read_pos, cg);
1f5596dd 1104
8b6987a2 1105 read_pos += read_cnt;
1f5596dd 1106
8b6987a2
CB
1107 /* Convert the time from nanoseconds to USER_HZ */
1108 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1109 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1110 j++;
1111 }
1f5596dd
CB
1112 }
1113
1114 *return_usage = move_ptr(cpu_usage);
1115 *size = cpucount;
1116 return 0;
1117}
1118
1119static bool cpuview_init_head(struct cg_proc_stat_head **head)
1120{
9d7fc1a3 1121 __do_free struct cg_proc_stat_head *h;
1f5596dd 1122
9d7fc1a3
CB
1123 h = zalloc(sizeof(struct cg_proc_stat_head));
1124 if (!h)
1125 return false;
1f5596dd 1126
9d7fc1a3
CB
1127 if (pthread_rwlock_init(&h->lock, NULL))
1128 return false;
1129
1130 h->lastcheck = time(NULL);
1f5596dd 1131
9d7fc1a3 1132 *head = move_ptr(h);
1f5596dd
CB
1133 return true;
1134}
1135
4ec5c9da 1136bool init_cpuview(void)
1f5596dd
CB
1137{
1138 int i;
1139
1140 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1141 proc_stat_history[i] = NULL;
1142
1143 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1144 if (!cpuview_init_head(&proc_stat_history[i]))
1145 goto err;
1146 }
1147
1148 return true;
1149
1150err:
1151 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1152 if (proc_stat_history[i])
1153 free_disarm(proc_stat_history[i]);
1154 }
1155
1156 return false;
1157}
1158
1f5596dd
CB
1159static void cpuview_free_head(struct cg_proc_stat_head *head)
1160{
905769cd 1161 struct cg_proc_stat *node;
1f5596dd
CB
1162
1163 if (head->next) {
1164 node = head->next;
1165
1166 for (;;) {
905769cd 1167 struct cg_proc_stat *cur = node;
1f5596dd 1168 node = node->next;
905769cd 1169 free_proc_stat_node(cur);
1f5596dd
CB
1170 if (!node)
1171 break;
1172 }
1173 }
1174
1175 pthread_rwlock_destroy(&head->lock);
1176 free_disarm(head);
1177}
1178
4ec5c9da 1179void free_cpuview(void)
1f5596dd 1180{
4ec5c9da 1181 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1f5596dd
CB
1182 if (proc_stat_history[i])
1183 cpuview_free_head(proc_stat_history[i]);
1f5596dd 1184}