/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

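/*
 * Overwrite a node's recorded host usage with fresh values and zero the
 * per-cpu view counters, so the container's view is rebuilt from scratch.
 */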
static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

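/*
 * Grow a node's usage/view arrays to cpu_count entries, carrying over the
 * counters that already exist and zero-initializing the new slots.
 */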
static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		} else {
			new_usage[i].user = 0;
			new_usage[i].system = 0;
			new_usage[i].idle = 0;

			new_view[i].user = 0;
			new_view[i].system = 0;
			new_view[i].idle = 0;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

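/*
 * Insert a node into its hash bucket under the write lock. If a node for
 * the same cgroup already exists, the new node is freed and the existing
 * one is returned instead.
 */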
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}

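/*
 * Allocate and initialize a stat node for cgroup @cg, copying @usage as the
 * initial host-side counters. Cleanup on failure is handled by the
 * call_cleaner/__do_free attributes, so every error path can simply return
 * NULL.
 */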
static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

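/*
 * Check whether @file exists for @cgroup in @controller; used below to
 * detect cgroups that have disappeared.
 */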
static bool cgfs_param_exist(const char *controller, const char *cgroup,
			     const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return (faccessat(cfd, path, F_OK, 0) == 0);
}

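/*
 * Walk one hash-bucket list and drop nodes whose cgroup no longer has a
 * cpu.shares file, i.e. whose cgroup has been removed. Returns the new
 * list head.
 */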
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			struct cg_proc_stat *tmp = node;

			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

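/*
 * Look up a node by cgroup name under the read lock. Every lookup also
 * gives the history a chance to prune stale entries.
 */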
static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

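/*
 * Return the stat node for @cg, creating it if needed, and hand it back
 * with its mutex held; the caller must unlock it when done.
 */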
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

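/*
 * Move up to *surplus ticks into *counter, limited by how much idle time
 * this cpu has left below the per-cpu threshold.
 */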
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	to_add = free_space > *surplus ? *surplus : free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

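/*
 * Compute per-cpu deltas between two snapshots into @diff and return the
 * total of all deltas. Offline cpus are skipped.
 */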
static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is
 * returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
	bool first = true;

	if (!pure_unified_layout(cgroup_ops)) {
		snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	} else {
		strcpy(file, "cpu.max");
		first = !strcmp(param, "quota");
	}

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	if (sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) != 1)
		return false;

	return true;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

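/*
 * Render the container's /proc/stat view: fold host usage into the cached
 * stat node, distribute any surplus over the visible cpus, and print one
 * cpuN line per visible cpu followed by the untouched remainder of the
 * host's /proc/stat.
 */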
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available in cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		/* Drop the node lock taken by find_or_create_proc_stat_node(). */
		pthread_mutex_unlock(&stat_node->lock);
		return 0;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* revise cpu usage view to support partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;
	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

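/*
 * FUSE read handler for the container's /proc/cpuinfo: copy only the
 * processor entries that fall within the task's cpuset (capped by the CPU
 * quota when cpuview is in use) and renumber them starting at 0. s390x
 * needs special casing since its cpuinfo layout differs.
 */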
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");

				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			i++;
			lxcfs_debug("cpu%d with time %s", i, tok);
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);

			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

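/*
 * Allocate and initialize one hash-bucket head, including its rwlock.
 */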
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head))
		return log_error(false, "%s", strerror(errno));

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		free_disarm(*head);
		return log_error(false, "Failed to initialize list lock");
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}