/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

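/*
 * For illustration: a cgroup path such as "/lxc/c1" is mapped to a bucket
 * via calc_hash("/lxc/c1") % CPUVIEW_HASH_SIZE, and entries whose paths
 * collide on the same bucket are chained through cg_proc_stat->next.
 */
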
static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		} else {
			new_usage[i].user = 0;
			new_usage[i].system = 0;
			new_usage[i].idle = 0;

			new_view[i].user = 0;
			new_view[i].system = 0;
			new_view[i].idle = 0;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free_disarm(node->cg);
	free_disarm(node->usage);
	free_disarm(node->view);
	free_disarm(node);
}

static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	struct cg_proc_stat *node;
	int i;

	node = malloc(sizeof(struct cg_proc_stat));
	if (!node)
		goto err;

	node->cg = NULL;
	node->usage = NULL;
	node->view = NULL;

	node->cg = malloc(strlen(cg) + 1);
	if (!node->cg)
		goto err;

	strcpy(node->cg, cg);

	node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->usage)
		goto err;

	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		goto err;

	node->cpu_count = cpu_count;
	node->next = NULL;

	if (pthread_mutex_init(&node->lock, NULL) != 0)
		log_error(goto err, "Failed to initialize node lock");

	for (i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	return node;

err:
	if (node && node->cg)
		free(node->cg);
	if (node && node->usage)
		free(node->usage);
	if (node && node->view)
		free(node->view);
	if (node)
		free(node);

	return NULL;
}

static bool cgfs_param_exist(const char *controller, const char *cgroup,
			     const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	if (is_relative(cgroup))
		path = must_make_path(cgroup, file, NULL);
	else
		path = must_make_path(".", cgroup, file, NULL);

	return (faccessat(cfd, path, F_OK, 0) == 0);
}

static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			struct cg_proc_stat *tmp = node;

			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

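/*
 * Pruning is rate-limited: each bucket records when it was last swept, so
 * e.g. a burst of /proc/stat reads within the same ten-second window
 * triggers at most one sweep. A node is considered stale once its cgroup
 * no longer exposes cpu.shares, i.e. the cgroup itself has gone away.
 */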
#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

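/*
 * Note: on success the node returned below is locked via node->lock; the
 * caller is responsible for calling pthread_mutex_unlock() once it is done
 * updating the node's usage/view arrays (see cpuview_proc_stat()).
 */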
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

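/*
 * Worked example for add_cpu_usage() below: with threshold = 100, a CPU at
 * user = 60, system = 20, idle = 50 has free_space = 100 - 60 - 20 = 20
 * (already below the idle cap of 50). Given *surplus = 30, to_add =
 * min(20, 30) = 20, so *counter grows by 20, idle drops to 30, and 10
 * ticks of surplus remain for the next CPU.
 */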
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	to_add = free_space > *surplus ? *surplus : free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When the cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

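/*
 * For reference, read_cpu_cfs_param() below deals with two on-disk formats.
 * On legacy (v1) hierarchies quota and period live in separate files, e.g.:
 *
 *	cpu.cfs_quota_us:  50000
 *	cpu.cfs_period_us: 100000
 *
 * On unified (v2) hierarchies both live in cpu.max as "$QUOTA $PERIOD",
 * e.g. "50000 100000", where a literal "max" means no quota; sscanf() then
 * matches nothing and the function returns false.
 */
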
/*
 * Read a cgroup CPU quota parameter from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is
 * returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
	bool first = true;

	if (!pure_unified_layout(cgroup_ops)) {
		snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	} else {
		strcpy(file, "cpu.max");
		first = !strcmp(param, "quota");
	}

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	if (sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) != 1)
		return false;

	return true;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	/* Initialize so a failed read falls through to the no-quota path. */
	int64_t cfs_quota = 0, cfs_period = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

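/*
 * Example: a quota of 50000 with a period of 100000 makes exact_cpu_count()
 * return 0.5, i.e. half a CPU's worth of runtime per period; 350000/100000
 * yields 3.5 (capped at the number of online host CPUs).
 */
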
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	/* Initialize so a failed read falls through to the no-quota path. */
	int64_t cfs_quota = 0, cfs_period = 0;
	int nr_cpus_in_cpuset = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

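/*
 * Example: max_cpu_count() rounds partial CPUs up, so a quota of 150000
 * with a period of 100000 yields 150000 / 100000 = 1 plus one CPU for the
 * remainder, i.e. 2 visible CPUs. A cpuset of "0-1" would leave that at 2,
 * while a cpuset of "0" would bring it down to 1.
 */
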
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* We cannot use more CPUs than are available in the cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		pthread_mutex_unlock(&stat_node->lock);
		return 0;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %" PRIu64 " for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %" PRIu64 " for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", diff_user, diff_system, diff_idle);

		/* Revise the cpu usage view to support the partial-CPU case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %" PRIu64 "\n", delta);
			lxcfs_v("idle_sum before: %" PRIu64 "\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %" PRIu64 "\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Failed to write cache");
	}
	if (l >= buf_size) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Write to cache was truncated");
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Failed to write cache");
		}
		if (l >= buf_size) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Write to cache was truncated");
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Failed to write cache");
	}
	if (l >= buf_size) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Write to cache was truncated");
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Failed to write cache");
		}
		if (l >= buf_size) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Write to cache was truncated");
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;
	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

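/*
 * For reference, the lines matched by is_processor_line() look like
 *
 *	processor	: 3
 *
 * on most architectures, while s390x reports a summary count plus one line
 * per CPU, e.g.:
 *
 *	# processors    : 4
 *	processor 0: version = FF, identification = ..., machine = ...
 *
 * which is why proc_cpuinfo_read() below carries a separate s390x branch.
 */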
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");

				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* Read from offset 0. */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

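/*
 * For reference, cpuacct.usage_all (cgroup v1) looks like
 *
 *	cpu user system
 *	0 100000000 50000000
 *	1 200000000 70000000
 *
 * with per-CPU times in nanoseconds, while cpuacct.usage_percpu is a single
 * line of per-CPU totals, e.g. "100000000 200000000". read_cpuacct_usage_all()
 * converts nanoseconds to USER_HZ ticks, so with the usual 100 ticks per
 * second 2500000000ns becomes 250 ticks.
 */
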
/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int cpucount;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_v("%s\n", "read_cpuacct_usage_all failed to determine number of clock ticks in a second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *data = NULL;
		size_t sz = 0, asz = 0;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;
		lxcfs_v("usage_str: %s\n", usage_str);

		/* Convert cpuacct.usage_percpu into cpuacct.usage_all. */
		lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");

		must_strcat(&data, &sz, &asz, "cpu user system\n");

		while (sscanf(usage_str + read_pos, "%" PRIu64 " %n", &cg_user, &read_cnt) > 0) {
			lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
			must_strcat(&data, &sz, &asz, "%d %" PRIu64 " 0\n", i, cg_user);
			i++;
			read_pos += read_cnt;
		}

		free(usage_str);
		usage_str = data;

		lxcfs_v("usage_str: %s\n", usage_str);
	}

	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
		return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

	/* Restart right after the header; read_pos may be stale from the
	 * cpuacct.usage_percpu fallback above. */
	read_pos = read_cnt;

	for (i = 0, j = 0; i < cpucount; i++) {
		ret = sscanf(usage_str + read_pos,
			     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
			     &cg_user, &cg_system, &read_cnt);
		if (ret == EOF)
			break;

		if (ret != 3)
			return log_error(-1, "read_cpuacct_usage_all reading from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head))
		return log_error(false, "%s", strerror(errno));

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		free_disarm(*head);
		return log_error(false, "Failed to initialize list lock");
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}