]>
git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_cpuview.c
89f366684881bacc921f6f47728c5816247909f2
1 /* SPDX-License-Identifier: LGPL-2.1+ */
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/socket.h>
28 #include <sys/syscall.h>
29 #include <sys/sysinfo.h>
32 #include "proc_cpuview.h"
35 #include "cgroup_fuse.h"
36 #include "cpuset_parse.h"
37 #include "cgroups/cgroup.h"
38 #include "cgroups/cgroup_utils.h"
39 #include "memory_utils.h"
40 #include "proc_loadavg.h"
43 /* Data for CPU view */
46 struct cpuacct_usage
*usage
; /* Real usage as read from the host's /proc/stat. */
47 struct cpuacct_usage
*view
; /* Usage stats reported to the container. */
49 pthread_mutex_t lock
; /* For node manipulation. */
50 struct cg_proc_stat
*next
;
53 struct cg_proc_stat_head
{
54 struct cg_proc_stat
*next
;
58 * For access to the list. Reading can be parallel, pruning is exclusive.
60 pthread_rwlock_t lock
;
63 #define CPUVIEW_HASH_SIZE 100
64 static struct cg_proc_stat_head
*proc_stat_history
[CPUVIEW_HASH_SIZE
];
66 static void reset_proc_stat_node(struct cg_proc_stat
*node
,
67 struct cpuacct_usage
*usage
, int cpu_count
)
69 lxcfs_debug("Resetting stat node for %s\n", node
->cg
);
70 memcpy(node
->usage
, usage
, sizeof(struct cpuacct_usage
) * cpu_count
);
72 for (int i
= 0; i
< cpu_count
; i
++) {
73 node
->view
[i
].user
= 0;
74 node
->view
[i
].system
= 0;
75 node
->view
[i
].idle
= 0;
78 node
->cpu_count
= cpu_count
;
81 static bool expand_proc_stat_node(struct cg_proc_stat
*node
, int cpu_count
)
83 __do_free
struct cpuacct_usage
*new_usage
= NULL
, *new_view
= NULL
;
85 /* Allocate new memory */
86 new_usage
= zalloc(sizeof(struct cpuacct_usage
) * cpu_count
);
90 new_view
= zalloc(sizeof(struct cpuacct_usage
) * cpu_count
);
94 /* Copy existing data & initialize new elements */
95 for (int i
= 0; i
< cpu_count
; i
++) {
96 if (i
< node
->cpu_count
) {
97 new_usage
[i
].user
= node
->usage
[i
].user
;
98 new_usage
[i
].system
= node
->usage
[i
].system
;
99 new_usage
[i
].idle
= node
->usage
[i
].idle
;
101 new_view
[i
].user
= node
->view
[i
].user
;
102 new_view
[i
].system
= node
->view
[i
].system
;
103 new_view
[i
].idle
= node
->view
[i
].idle
;
108 node
->usage
= move_ptr(new_usage
);
111 node
->view
= move_ptr(new_view
);
112 node
->cpu_count
= cpu_count
;
117 static void free_proc_stat_node(struct cg_proc_stat
*node
)
121 * We're abusing the usage pointer to indicate that
122 * pthread_mutex_init() was successful. Don't judge me.
125 pthread_mutex_destroy(&node
->lock
);
126 free_disarm(node
->cg
);
127 free_disarm(node
->usage
);
128 free_disarm(node
->view
);
133 define_cleanup_function(struct cg_proc_stat
*, free_proc_stat_node
);
135 static struct cg_proc_stat
*add_proc_stat_node(struct cg_proc_stat
*new_node
)
137 call_cleaner(free_proc_stat_node
) struct cg_proc_stat
*new = new_node
;
138 struct cg_proc_stat
*rv
= new_node
;
139 int hash
= calc_hash(new->cg
) % CPUVIEW_HASH_SIZE
;
140 struct cg_proc_stat_head
*head
= proc_stat_history
[hash
];
141 struct cg_proc_stat
*cur
;
143 pthread_rwlock_wrlock(&head
->lock
);
146 head
->next
= move_ptr(new);
147 goto out_rwlock_unlock
;
154 * The node to be added is already present in the list, so
155 * free the newly allocated one and return the one we found.
157 if (strcmp(cur
->cg
, new->cg
) == 0) {
159 goto out_rwlock_unlock
;
168 /* Add new node to end of list. */
169 cur
->next
= move_ptr(new);
170 goto out_rwlock_unlock
;
174 pthread_mutex_lock(&rv
->lock
);
175 pthread_rwlock_unlock(&head
->lock
);
179 static struct cg_proc_stat
*new_proc_stat_node(struct cpuacct_usage
*usage
,
180 int cpu_count
, const char *cg
)
182 call_cleaner(free_proc_stat_node
) struct cg_proc_stat
*node
= NULL
;
183 __do_free
struct cpuacct_usage
*new_usage
= NULL
;
185 node
= zalloc(sizeof(struct cg_proc_stat
));
189 node
->cg
= strdup(cg
);
193 new_usage
= memdup(usage
, sizeof(struct cpuacct_usage
) * cpu_count
);
197 node
->view
= zalloc(sizeof(struct cpuacct_usage
) * cpu_count
);
201 node
->cpu_count
= cpu_count
;
203 if (pthread_mutex_init(&node
->lock
, NULL
))
206 * We're abusing the usage pointer to indicate that
207 * pthread_mutex_init() was successful. Don't judge me.
209 node
->usage
= move_ptr(new_usage
);
211 return move_ptr(node
);
214 static bool cgroup_supports(const char *controller
, const char *cgroup
,
217 __do_free
char *path
= NULL
;
220 cfd
= get_cgroup_fd(controller
);
224 path
= must_make_path_relative(cgroup
, file
, NULL
);
225 return faccessat(cfd
, path
, F_OK
, 0) == 0;
228 /* should be called with wr-locked list */
229 static struct cg_proc_stat
*prune_proc_stat_list(struct cg_proc_stat
*node
)
231 struct cg_proc_stat
*first
= NULL
;
233 for (struct cg_proc_stat
*prev
= NULL
; node
; ) {
234 if (!cgroup_supports("cpu", node
->cg
, "cpu.shares")) {
235 struct cg_proc_stat
*cur
= node
;
238 * We need to ensure that no one referenced this node,
239 * because we are going to remove it from the list and free memory.
241 * If we can't grab the lock then just keep this node for now.
243 if (pthread_mutex_trylock(&cur
->lock
))
247 * Yes, we can put lock back just after taking it, as we ensured
248 * that we are only one user of it right now.
250 * It follows from three facts:
251 * - we are under pthread_rwlock_wrlock(hash_table_bucket)
252 * - pthread_mutex_lock is taken by find_proc_stat_node()
253 * with pthread_rwlock_rdlock(hash_table_bucket) held.
254 * - pthread_mutex_lock is taken by add_proc_stat_node()
255 * with pthread_rwlock_wrlock(hash_table_bucket) held.
257 * It means that nobody can get a pointer to (cur) node in a parallel
258 * thread and all old users of (cur) node have released pthread_mutex_lock(cur).
260 pthread_mutex_unlock(&cur
->lock
);
263 prev
->next
= node
->next
;
268 lxcfs_debug("Removing stat node for %s\n", cur
);
270 free_proc_stat_node(cur
);
283 #define PROC_STAT_PRUNE_INTERVAL 10
284 static void prune_proc_stat_history(void)
286 time_t now
= time(NULL
);
288 for (int i
= 0; i
< CPUVIEW_HASH_SIZE
; i
++) {
289 pthread_rwlock_wrlock(&proc_stat_history
[i
]->lock
);
291 if ((proc_stat_history
[i
]->lastcheck
+ PROC_STAT_PRUNE_INTERVAL
) > now
) {
292 pthread_rwlock_unlock(&proc_stat_history
[i
]->lock
);
296 if (proc_stat_history
[i
]->next
) {
297 proc_stat_history
[i
]->next
= prune_proc_stat_list(proc_stat_history
[i
]->next
);
298 proc_stat_history
[i
]->lastcheck
= now
;
301 pthread_rwlock_unlock(&proc_stat_history
[i
]->lock
);
305 static struct cg_proc_stat
*find_proc_stat_node(struct cg_proc_stat_head
*head
,
308 struct cg_proc_stat
*node
;
310 prune_proc_stat_history();
311 pthread_rwlock_rdlock(&head
->lock
);
314 pthread_rwlock_unlock(&head
->lock
);
321 if (strcmp(cg
, node
->cg
) == 0) {
322 pthread_mutex_lock(&node
->lock
);
325 } while ((node
= node
->next
));
330 pthread_rwlock_unlock(&head
->lock
);
334 static struct cg_proc_stat
*find_or_create_proc_stat_node(struct cpuacct_usage
*usage
,
335 int cpu_count
, const char *cg
)
337 int hash
= calc_hash(cg
) % CPUVIEW_HASH_SIZE
;
338 struct cg_proc_stat_head
*head
= proc_stat_history
[hash
];
339 struct cg_proc_stat
*node
;
341 node
= find_proc_stat_node(head
, cg
);
343 node
= new_proc_stat_node(usage
, cpu_count
, cg
);
347 node
= add_proc_stat_node(node
);
348 lxcfs_debug("New stat node (%d) for %s\n", cpu_count
, cg
);
352 * If additional CPUs on the host have been enabled, CPU usage counter
353 * arrays have to be expanded.
355 if (node
->cpu_count
< cpu_count
) {
356 lxcfs_debug("Expanding stat node %d->%d for %s\n",
357 node
->cpu_count
, cpu_count
, cg
);
359 if (!expand_proc_stat_node(node
, cpu_count
)) {
360 pthread_mutex_unlock(&node
->lock
);
361 return log_debug(NULL
, "Unable to expand stat node %d->%d for %s", node
->cpu_count
, cpu_count
, cg
);
368 static void add_cpu_usage(uint64_t *surplus
, struct cpuacct_usage
*usage
,
369 uint64_t *counter
, uint64_t threshold
)
371 uint64_t free_space
, to_add
;
373 free_space
= threshold
- usage
->user
- usage
->system
;
375 if (free_space
> usage
->idle
)
376 free_space
= usage
->idle
;
378 if (free_space
> *surplus
)
384 usage
->idle
-= to_add
;
388 static uint64_t diff_cpu_usage(struct cpuacct_usage
*older
,
389 struct cpuacct_usage
*newer
,
390 struct cpuacct_usage
*diff
, int cpu_count
)
394 for (int i
= 0; i
< cpu_count
; i
++) {
395 if (!newer
[i
].online
)
399 * When cpuset is changed on the fly, the CPUs might get
400 * reordered. We could either reset all counters, or check
401 * that the subtractions below will return expected results.
403 if (newer
[i
].user
> older
[i
].user
)
404 diff
[i
].user
= newer
[i
].user
- older
[i
].user
;
408 if (newer
[i
].system
> older
[i
].system
)
409 diff
[i
].system
= newer
[i
].system
- older
[i
].system
;
413 if (newer
[i
].idle
> older
[i
].idle
)
414 diff
[i
].idle
= newer
[i
].idle
- older
[i
].idle
;
419 sum
+= diff
[i
].system
;
427 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
428 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
431 static bool read_cpu_cfs_param(const char *cg
, const char *param
, int64_t *value
)
433 __do_free
char *str
= NULL
;
434 char file
[STRLITERALLEN("cpu.cfs_period_us") + 1];
438 if (pure_unified_layout(cgroup_ops
)) {
439 first
= !strcmp(param
, "quota");
440 ret
= snprintf(file
, sizeof(file
), "cpu.max");
442 ret
= snprintf(file
, sizeof(file
), "cpu.cfs_%s_us", param
);
444 if (ret
< 0 || (size_t)ret
>= sizeof(file
))
447 if (!cgroup_ops
->get(cgroup_ops
, "cpu", cg
, file
, &str
))
450 return sscanf(str
, first
? "%" PRId64
: "%*d %" PRId64
, value
) == 1;
454 * Return the exact number of visible CPUs based on CPU quotas.
455 * If there is no quota set, zero is returned.
457 static double exact_cpu_count(const char *cg
)
461 int64_t cfs_quota
, cfs_period
;
463 if (!read_cpu_cfs_param(cg
, "quota", &cfs_quota
))
466 if (!read_cpu_cfs_param(cg
, "period", &cfs_period
))
469 if (cfs_quota
<= 0 || cfs_period
<= 0)
472 rv
= (double)cfs_quota
/ (double)cfs_period
;
474 nprocs
= get_nprocs();
/*
 * Report whether the CFS quota of cgroup @cg is disabled: true when the
 * quota is not set (it cannot be read) or when it is negative.
 */
static bool cfs_quota_disabled(const char *cg)
{
	int64_t quota;
	bool readable = read_cpu_cfs_param(cg, "quota", &quota);

	return !readable || quota < 0;
}
496 * Return the maximum number of visible CPUs based on CPU quotas.
497 * If there is no quota set, cpu number in cpuset value is returned.
499 int max_cpu_count(const char *cg
)
501 __do_free
char *cpuset
= NULL
;
503 int64_t cfs_quota
, cfs_period
;
504 int nr_cpus_in_cpuset
= 0;
506 if (!read_cpu_cfs_param(cg
, "quota", &cfs_quota
))
509 if (!read_cpu_cfs_param(cg
, "period", &cfs_period
))
512 cpuset
= get_cpuset(cg
);
514 nr_cpus_in_cpuset
= cpu_number_in_cpuset(cpuset
);
516 if (cfs_quota
<= 0 || cfs_period
<= 0) {
517 if (nr_cpus_in_cpuset
> 0)
518 return nr_cpus_in_cpuset
;
523 rv
= cfs_quota
/ cfs_period
;
526 * In case quota/period does not yield a whole number, add one CPU for
529 if ((cfs_quota
% cfs_period
) > 0)
532 nprocs
= get_nprocs();
536 /* Use min value in cpu quota and cpuset. */
537 if (nr_cpus_in_cpuset
> 0 && nr_cpus_in_cpuset
< rv
)
538 rv
= nr_cpus_in_cpuset
;
543 int cpuview_proc_stat(const char *cg
, const char *cpuset
,
544 struct cpuacct_usage
*cg_cpu_usage
, int cg_cpu_usage_size
,
545 FILE *f
, char *buf
, size_t buf_size
)
547 __do_free
char *line
= NULL
;
548 __do_free
struct cpuacct_usage
*diff
= NULL
;
549 size_t linelen
= 0, total_len
= 0;
550 int curcpu
= -1; /* cpu numbering starts at 0 */
553 uint64_t user
= 0, nice
= 0, system
= 0, idle
= 0, iowait
= 0, irq
= 0,
554 softirq
= 0, steal
= 0, guest
= 0, guest_nice
= 0;
555 uint64_t user_sum
= 0, system_sum
= 0, idle_sum
= 0;
556 uint64_t user_surplus
= 0, system_surplus
= 0;
557 int nprocs
, max_cpus
;
559 uint64_t total_sum
, threshold
;
560 struct cg_proc_stat
*stat_node
;
562 nprocs
= get_nprocs_conf();
563 if (cg_cpu_usage_size
< nprocs
)
564 nprocs
= cg_cpu_usage_size
;
566 /* Read all CPU stats and stop when we've encountered other lines */
567 while (getline(&line
, &linelen
, f
) != -1) {
569 char cpu_char
[10]; /* That's a lot of cores */
570 uint64_t all_used
, cg_used
;
572 if (strlen(line
) == 0)
575 /* not a ^cpuN line containing a number N */
576 if (sscanf(line
, "cpu%9[^ ]", cpu_char
) != 1)
579 if (sscanf(cpu_char
, "%d", &physcpu
) != 1)
582 if (physcpu
>= cg_cpu_usage_size
)
588 if (!cpu_in_cpuset(physcpu
, cpuset
)) {
589 for (i
= curcpu
; i
<= physcpu
; i
++)
590 cg_cpu_usage
[i
].online
= false;
594 if (curcpu
< physcpu
) {
595 /* Some CPUs may be disabled */
596 for (i
= curcpu
; i
< physcpu
; i
++)
597 cg_cpu_usage
[i
].online
= false;
602 cg_cpu_usage
[curcpu
].online
= true;
604 ret
= sscanf(line
, "%*s %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
"lu",
618 all_used
= user
+ nice
+ system
+ iowait
+ irq
+ softirq
+ steal
+ guest
+ guest_nice
;
619 cg_used
= cg_cpu_usage
[curcpu
].user
+ cg_cpu_usage
[curcpu
].system
;
621 if (all_used
>= cg_used
) {
622 cg_cpu_usage
[curcpu
].idle
= idle
+ (all_used
- cg_used
);
624 lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64
" in /proc/stat, %" PRIu64
" in cpuacct.usage_all; unable to determine idle time",
625 curcpu
, cg
, all_used
, cg_used
);
626 cg_cpu_usage
[curcpu
].idle
= idle
;
630 /* Cannot use more CPUs than is available in cpuset. */
631 max_cpus
= max_cpu_count(cg
);
632 if (max_cpus
> cpu_cnt
|| !max_cpus
)
635 /* takes lock pthread_mutex_lock(&node->lock) */
636 stat_node
= find_or_create_proc_stat_node(cg_cpu_usage
, nprocs
, cg
);
638 return log_error(0, "Failed to find/create stat node for %s", cg
);
640 diff
= zalloc(sizeof(struct cpuacct_usage
) * nprocs
);
642 goto out_pthread_mutex_unlock
;
645 * If the new values are LOWER than values stored in memory, it means
646 * the cgroup has been reset/recreated and we should reset too.
648 for (curcpu
= 0; curcpu
< nprocs
; curcpu
++) {
649 if (!cg_cpu_usage
[curcpu
].online
)
652 if (cg_cpu_usage
[curcpu
].user
< stat_node
->usage
[curcpu
].user
)
653 reset_proc_stat_node(stat_node
, cg_cpu_usage
, nprocs
);
658 total_sum
= diff_cpu_usage(stat_node
->usage
, cg_cpu_usage
, diff
, nprocs
);
660 for (curcpu
= 0, i
= -1; curcpu
< nprocs
; curcpu
++) {
661 stat_node
->usage
[curcpu
].online
= cg_cpu_usage
[curcpu
].online
;
663 if (!stat_node
->usage
[curcpu
].online
)
668 stat_node
->usage
[curcpu
].user
+= diff
[curcpu
].user
;
669 stat_node
->usage
[curcpu
].system
+= diff
[curcpu
].system
;
670 stat_node
->usage
[curcpu
].idle
+= diff
[curcpu
].idle
;
672 if (max_cpus
> 0 && i
>= max_cpus
) {
673 user_surplus
+= diff
[curcpu
].user
;
674 system_surplus
+= diff
[curcpu
].system
;
678 /* Calculate usage counters of visible CPUs */
680 uint64_t diff_user
= 0;
681 uint64_t diff_system
= 0;
682 uint64_t diff_idle
= 0;
683 uint64_t max_diff_idle
= 0;
684 uint64_t max_diff_idle_index
= 0;
686 /* threshold = maximum usage per cpu, including idle */
687 threshold
= total_sum
/ cpu_cnt
* max_cpus
;
689 for (curcpu
= 0, i
= -1; curcpu
< nprocs
; curcpu
++) {
690 if (!stat_node
->usage
[curcpu
].online
)
698 if (diff
[curcpu
].user
+ diff
[curcpu
].system
>= threshold
)
702 add_cpu_usage(&user_surplus
, &diff
[curcpu
],
703 &diff
[curcpu
].user
, threshold
);
705 if (diff
[curcpu
].user
+ diff
[curcpu
].system
>= threshold
)
708 /* If there is still room, add system */
709 add_cpu_usage(&system_surplus
, &diff
[curcpu
],
710 &diff
[curcpu
].system
, threshold
);
713 if (user_surplus
> 0)
714 lxcfs_debug("leftover user: %" PRIu64
"for %s\n", user_surplus
, cg
);
715 if (system_surplus
> 0)
716 lxcfs_debug("leftover system: %" PRIu64
"for %s\n", system_surplus
, cg
);
718 for (curcpu
= 0, i
= -1; curcpu
< nprocs
; curcpu
++) {
719 if (!stat_node
->usage
[curcpu
].online
)
727 stat_node
->view
[curcpu
].user
+= diff
[curcpu
].user
;
728 stat_node
->view
[curcpu
].system
+= diff
[curcpu
].system
;
729 stat_node
->view
[curcpu
].idle
+= diff
[curcpu
].idle
;
731 diff_user
+= diff
[curcpu
].user
;
732 diff_system
+= diff
[curcpu
].system
;
733 diff_idle
+= diff
[curcpu
].idle
;
734 if (diff
[curcpu
].idle
> max_diff_idle
) {
735 max_diff_idle
= diff
[curcpu
].idle
;
736 max_diff_idle_index
= curcpu
;
739 lxcfs_v("curcpu: %d, diff_user: %" PRIu64
", diff_system: %" PRIu64
", diff_idle: %" PRIu64
"\n", curcpu
, diff
[curcpu
].user
, diff
[curcpu
].system
, diff
[curcpu
].idle
);
741 lxcfs_v("total. diff_user: %" PRIu64
", diff_system: %" PRIu64
", diff_idle: %" PRIu64
"\n", diff_user
, diff_system
, diff_idle
);
743 for (curcpu
= 0; curcpu
< nprocs
; curcpu
++) {
744 user_sum
+= stat_node
->view
[curcpu
].user
;
745 system_sum
+= stat_node
->view
[curcpu
].system
;
746 idle_sum
+= stat_node
->view
[curcpu
].idle
;
749 /* revise cpu usage view to support partial cpu case. */
750 exact_cpus
= exact_cpu_count(cg
);
752 /* skip revise cpu when cfs quota is disabled (exact_cpus == 0) */
753 if (!cfs_quota_disabled(cg
) && exact_cpus
< (double)max_cpus
){
754 uint64_t delta
= (uint64_t)((double)(diff_user
+ diff_system
+ diff_idle
) * (1 - exact_cpus
/ (double)max_cpus
));
756 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus
);
757 lxcfs_v("delta: %" PRIu64
"\n", delta
);
758 lxcfs_v("idle_sum before: %" PRIu64
"\n", idle_sum
);
759 if (idle_sum
> delta
)
760 idle_sum
= idle_sum
- delta
;
763 lxcfs_v("idle_sum after: %l" PRIu64
"\n", idle_sum
);
765 curcpu
= max_diff_idle_index
;
766 lxcfs_v("curcpu: %d, idle before: %" PRIu64
"\n", curcpu
, stat_node
->view
[curcpu
].idle
);
767 if (stat_node
->view
[curcpu
].idle
> delta
)
768 stat_node
->view
[curcpu
].idle
= stat_node
->view
[curcpu
].idle
- delta
;
770 stat_node
->view
[curcpu
].idle
= 0;
771 lxcfs_v("curcpu: %d, idle after: %" PRIu64
"\n", curcpu
, stat_node
->view
[curcpu
].idle
);
774 for (curcpu
= 0; curcpu
< nprocs
; curcpu
++) {
775 if (!stat_node
->usage
[curcpu
].online
)
778 stat_node
->view
[curcpu
].user
= stat_node
->usage
[curcpu
].user
;
779 stat_node
->view
[curcpu
].system
= stat_node
->usage
[curcpu
].system
;
780 stat_node
->view
[curcpu
].idle
= stat_node
->usage
[curcpu
].idle
;
782 user_sum
+= stat_node
->view
[curcpu
].user
;
783 system_sum
+= stat_node
->view
[curcpu
].system
;
784 idle_sum
+= stat_node
->view
[curcpu
].idle
;
788 /* Render the file */
790 l
= snprintf(buf
, buf_size
,
791 "cpu %" PRIu64
" 0 %" PRIu64
" %" PRIu64
" 0 0 0 0 0 0\n",
792 user_sum
, system_sum
, idle_sum
);
793 lxcfs_v("cpu-all: %s\n", buf
);
795 lxcfs_error("Failed to write cache");
797 goto out_pthread_mutex_unlock
;
799 if ((size_t)l
>= buf_size
) {
800 lxcfs_error("Write to cache was truncated");
802 goto out_pthread_mutex_unlock
;
809 /* Render visible CPUs
810 Assume there are K CPUs: 0, 1, 2, ..., K-1.
811 Among them, there are M online CPUs with index: a1, a2, ... aN ... aM (M >= N)
812 N = max_cpus, M = number of online CPUs
814 There will be N rendered cpus, indexed from 0 to N-1; the cpu times of each rendered cpu are calculated from these formulas:
815 - user_time[0] = stat_node->view[0].user + stat_node->view[1].user + ... + stat_node->view[a1].user
816 - user_time[1] = stat_node->view[a1+1].user + stat_node->view[a1+2].user + ... + stat_node->view[a2].user
818 - user_time[N-2] = stat_node->view[a(N-2)+1].user + stat_node->view[a(N-2)+2].user + ...
819 + stat_node->view[a(N-1)].user
820 - user_time[N-1] = stat_node->view[a(N-1)+1].user + stat_node->view[a(N-1)+2].user + ...
821 + stat_node->view[aN].user + ... + stat_node->view[K-1].user (sum of all remaining CPUs)
823 Similar formula applied for system and idle time
826 uint64_t curcpu_view_user_sum
= 0, curcpu_view_system_sum
= 0, curcpu_view_idle_sum
= 0;
827 for (curcpu
= 0, i
= -1; curcpu
< nprocs
; curcpu
++) {
828 curcpu_view_user_sum
+= stat_node
->view
[curcpu
].user
;
829 curcpu_view_system_sum
+= stat_node
->view
[curcpu
].system
;
830 curcpu_view_idle_sum
+= stat_node
->view
[curcpu
].idle
;
832 if (!stat_node
->usage
[curcpu
].online
&& curcpu
< nprocs
- 1) {
838 if (max_cpus
> 0 && i
>= max_cpus
) {
839 // max(i) = count(rendered cpus) = max_cpus - 1
843 if (max_cpus
> 0 && i
== max_cpus
- 1 && curcpu
< nprocs
- 1) {
844 // last 'rendered' cpu, sum until reaches the last cpu
848 l
= snprintf(buf
, buf_size
, "cpu%d %" PRIu64
" 0 %" PRIu64
" %" PRIu64
" 0 0 0 0 0 0\n",
850 curcpu_view_user_sum
,
851 curcpu_view_system_sum
,
852 curcpu_view_idle_sum
);
853 lxcfs_v("cpu: %s\n", buf
);
855 lxcfs_error("Failed to write cache");
857 goto out_pthread_mutex_unlock
;
859 if ((size_t)l
>= buf_size
) {
860 lxcfs_error("Write to cache was truncated");
862 goto out_pthread_mutex_unlock
;
869 curcpu_view_user_sum
= 0;
870 curcpu_view_system_sum
= 0;
871 curcpu_view_idle_sum
= 0;
874 /* Pass the rest of /proc/stat, start with the last line read */
875 l
= snprintf(buf
, buf_size
, "%s", line
);
877 lxcfs_error("Failed to write cache");
879 goto out_pthread_mutex_unlock
;
881 if ((size_t)l
>= buf_size
) {
882 lxcfs_error("Write to cache was truncated");
884 goto out_pthread_mutex_unlock
;
891 /* Pass the rest of the host's /proc/stat */
892 while (getline(&line
, &linelen
, f
) != -1) {
893 l
= snprintf(buf
, buf_size
, "%s", line
);
895 lxcfs_error("Failed to write cache");
897 goto out_pthread_mutex_unlock
;
899 if ((size_t)l
>= buf_size
) {
900 lxcfs_error("Write to cache was truncated");
902 goto out_pthread_mutex_unlock
;
910 out_pthread_mutex_unlock
:
912 pthread_mutex_unlock(&stat_node
->lock
);
/*
 * Check whether this is a "processor : N" line from /proc/cpuinfo, i.e. the
 * line starts with "processor", a colon and a decimal number.
 */
static inline bool is_processor_line(const char *line)
{
	int ignored;
	int converted = sscanf(line, "processor : %d", &ignored);

	return converted == 1;
}
/*
 * For a "processor : N" line, report whether cpu N is in @cpuset as decided
 * by cpu_in_cpuset(). Any other line yields false.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) != 1)
		return false;

	return cpu_in_cpuset(cpu, cpuset);
}
936 int proc_cpuinfo_read(char *buf
, size_t size
, off_t offset
,
937 struct fuse_file_info
*fi
)
939 __do_free
char *cg
= NULL
, *cpuset
= NULL
, *line
= NULL
;
940 __do_free
void *fopen_cache
= NULL
;
941 __do_fclose
FILE *f
= NULL
;
942 struct fuse_context
*fc
= fuse_get_context();
943 struct lxcfs_opts
*opts
= (struct lxcfs_opts
*)fc
->private_data
;
944 struct file_info
*d
= INTTYPE_TO_PTR(fi
->fh
);
945 size_t linelen
= 0, total_len
= 0;
946 bool am_printing
= false, firstline
= true, is_s390x
= false;
947 int curcpu
= -1, cpu
, max_cpus
= 0;
949 char *cache
= d
->buf
;
950 size_t cache_size
= d
->buflen
;
955 if (offset
> d
->size
)
961 left
= d
->size
- offset
;
962 total_len
= left
> size
? size
: left
;
963 memcpy(buf
, cache
+ offset
, total_len
);
968 pid_t initpid
= lookup_initpid_in_store(fc
->pid
);
969 if (initpid
<= 1 || is_shared_pidns(initpid
))
972 cg
= get_pid_cgroup(initpid
, "cpuset");
974 return read_file_fuse("proc/cpuinfo", buf
, size
, d
);
975 prune_init_slice(cg
);
977 cpuset
= get_cpuset(cg
);
981 if (cgroup_ops
->can_use_cpuview(cgroup_ops
) && opts
&& opts
->use_cfs
)
986 max_cpus
= max_cpu_count(cg
);
988 f
= fopen_cached("/proc/cpuinfo", "re", &fopen_cache
);
992 while (getline(&line
, &linelen
, f
) != -1) {
996 if (strstr(line
, "IBM/S390") != NULL
) {
1003 if (strncmp(line
, "# processors:", 12) == 0)
1006 if (is_processor_line(line
)) {
1007 if (use_view
&& max_cpus
> 0 && (curcpu
+ 1) == max_cpus
)
1010 am_printing
= cpuline_in_cpuset(line
, cpuset
);
1013 l
= snprintf(cache
, cache_size
, "processor : %d\n", curcpu
);
1015 return log_error(0, "Failed to write cache");
1016 if ((size_t)l
>= cache_size
)
1017 return log_error(0, "Write to cache was truncated");
1023 } else if (is_s390x
&& sscanf(line
, "processor %d:", &cpu
) == 1) {
1026 if (use_view
&& max_cpus
> 0 && (curcpu
+ 1) == max_cpus
)
1029 if (!cpu_in_cpuset(cpu
, cpuset
))
1033 p
= strchr(line
, ':');
1038 l
= snprintf(cache
, cache_size
, "processor %d:%s", curcpu
, p
);
1040 return log_error(0, "Failed to write cache");
1041 if ((size_t)l
>= cache_size
)
1042 return log_error(0, "Write to cache was truncated");
1051 l
= snprintf(cache
, cache_size
, "%s", line
);
1053 return log_error(0, "Failed to write cache");
1054 if ((size_t)l
>= cache_size
)
1055 return log_error(0, "Write to cache was truncated");
1064 __do_free
char *origcache
= d
->buf
;
1067 d
->buf
= malloc(d
->buflen
);
1069 d
->buf
= move_ptr(origcache
);
1074 cache_size
= d
->buflen
;
1076 l
= snprintf(cache
, cache_size
, "vendor_id : IBM/S390\n");
1077 if (l
< 0 || (size_t)l
>= cache_size
)
1083 l
= snprintf(cache
, cache_size
, "# processors : %d\n", curcpu
+ 1);
1084 if (l
< 0 || (size_t)l
>= cache_size
)
1090 l
= snprintf(cache
, cache_size
, "%s", origcache
);
1091 if (l
< 0 || (size_t)l
>= cache_size
)
1097 d
->size
= total_len
;
1098 if (total_len
> size
)
1101 /* read from off 0 */
1102 memcpy(buf
, d
->buf
, total_len
);
1108 * Returns 0 on success.
1109 * It is the caller's responsibility to free `return_usage`, unless this
1110 * function returns an error.
1112 int read_cpuacct_usage_all(char *cg
, char *cpuset
,
1113 struct cpuacct_usage
**return_usage
, int *size
)
1115 __do_free
char *usage_str
= NULL
;
1116 __do_free
struct cpuacct_usage
*cpu_usage
= NULL
;
1117 int i
= 0, j
= 0, read_pos
= 0, read_cnt
= 0;
1121 uint64_t cg_user
, cg_system
;
1122 int64_t ticks_per_sec
;
1124 ticks_per_sec
= sysconf(_SC_CLK_TCK
);
1125 if (ticks_per_sec
< 0 && errno
== EINVAL
) {
1126 lxcfs_debug("%m - Failed to determine number of ticks per second");
1130 cpucount
= get_nprocs_conf();
1131 cpu_usage
= malloc(sizeof(struct cpuacct_usage
) * cpucount
);
1135 memset(cpu_usage
, 0, sizeof(struct cpuacct_usage
) * cpucount
);
1136 if (!cgroup_ops
->get(cgroup_ops
, "cpuacct", cg
, "cpuacct.usage_all", &usage_str
)) {
1137 char *sep
= " \t\n";
1140 /* Read cpuacct.usage_percpu instead. */
1141 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1142 if (!cgroup_ops
->get(cgroup_ops
, "cpuacct", cg
, "cpuacct.usage_percpu", &usage_str
))
1145 lxc_iterate_parts(tok
, usage_str
, sep
) {
1146 uint64_t percpu_user
;
1151 tok
= trim_whitespace_in_place(tok
);
1152 ret
= safe_uint64(tok
, &percpu_user
, 10);
1156 /* Convert the time from nanoseconds to USER_HZ */
1157 cpu_usage
[i
].user
= percpu_user
/ 1000.0 / 1000 / 1000 * ticks_per_sec
;
1158 cpu_usage
[i
].system
= cpu_usage
[i
].user
;
1160 lxcfs_debug("cpu%d with time %s", i
, tok
);
1163 if (sscanf(usage_str
, "cpu user system\n%n", &read_cnt
) != 0)
1164 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg
);
1166 read_pos
+= read_cnt
;
1168 for (i
= 0, j
= 0; i
< cpucount
; i
++) {
1169 ret
= sscanf(usage_str
+ read_pos
,
1170 "%d %" PRIu64
" %" PRIu64
"\n%n", &cg_cpu
,
1171 &cg_user
, &cg_system
, &read_cnt
);
1177 return log_error(-EINVAL
, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1178 usage_str
+ read_pos
, cg
);
1180 read_pos
+= read_cnt
;
1182 /* Convert the time from nanoseconds to USER_HZ */
1183 cpu_usage
[j
].user
= cg_user
/ 1000.0 / 1000 / 1000 * ticks_per_sec
;
1184 cpu_usage
[j
].system
= cg_system
/ 1000.0 / 1000 / 1000 * ticks_per_sec
;
1189 *return_usage
= move_ptr(cpu_usage
);
1194 static bool cpuview_init_head(struct cg_proc_stat_head
**head
)
1196 __do_free
struct cg_proc_stat_head
*h
;
1198 h
= zalloc(sizeof(struct cg_proc_stat_head
));
1202 if (pthread_rwlock_init(&h
->lock
, NULL
))
1205 h
->lastcheck
= time(NULL
);
1207 *head
= move_ptr(h
);
1211 bool init_cpuview(void)
1215 for (i
= 0; i
< CPUVIEW_HASH_SIZE
; i
++)
1216 proc_stat_history
[i
] = NULL
;
1218 for (i
= 0; i
< CPUVIEW_HASH_SIZE
; i
++) {
1219 if (!cpuview_init_head(&proc_stat_history
[i
]))
1226 for (i
= 0; i
< CPUVIEW_HASH_SIZE
; i
++) {
1227 if (proc_stat_history
[i
])
1228 free_disarm(proc_stat_history
[i
]);
1234 static void cpuview_free_head(struct cg_proc_stat_head
*head
)
1236 struct cg_proc_stat
*node
;
1242 struct cg_proc_stat
*cur
= node
;
1244 free_proc_stat_node(cur
);
1250 pthread_rwlock_destroy(&head
->lock
);
1254 void free_cpuview(void)
1256 for (int i
= 0; i
< CPUVIEW_HASH_SIZE
; i
++)
1257 if (proc_stat_history
[i
])
1258 cpuview_free_head(proc_stat_history
[i
]);