/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

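/*
 * Grow a node's usage/view arrays to cpu_count entries, preserving the
 * existing counters and zero-initializing the new tail.
 */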
static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		} else {
			new_usage[i].user = 0;
			new_usage[i].system = 0;
			new_usage[i].idle = 0;

			new_view[i].user = 0;
			new_view[i].system = 0;
			new_view[i].idle = 0;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free_disarm(node->cg);
	free_disarm(node->usage);
	free_disarm(node->view);
	free_disarm(node);
}

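/*
 * Insert a node into the hash bucket for its cgroup. If a node for the same
 * cgroup is already present, the new node is freed and the existing one is
 * returned instead.
 */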
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}

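/* Allocate a stat node for @cg, seeded with the current usage counters. */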
static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	struct cg_proc_stat *node;
	int i;

	node = malloc(sizeof(struct cg_proc_stat));
	if (!node)
		goto err;

	node->cg = NULL;
	node->usage = NULL;
	node->view = NULL;

	node->cg = malloc(strlen(cg) + 1);
	if (!node->cg)
		goto err;

	strcpy(node->cg, cg);

	node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->usage)
		goto err;

	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		goto err;

	node->cpu_count = cpu_count;
	node->next = NULL;

	if (pthread_mutex_init(&node->lock, NULL) != 0)
		log_error(goto err, "Failed to initialize node lock");

	for (i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	return node;

err:
	if (node && node->cg)
		free(node->cg);
	if (node && node->usage)
		free(node->usage);
	if (node && node->view)
		free(node->view);
	if (node)
		free(node);

	return NULL;
}

static bool cgfs_param_exist(const char *controller, const char *cgroup,
			     const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path(dot_or_empty(cgroup), cgroup, file);
	return (faccessat(cfd, path, F_OK, 0) == 0);
}

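/*
 * Walk one hash bucket and drop nodes whose cgroup has disappeared (detected
 * by a missing cpu.shares file). Returns the new head of the list.
 */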
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			struct cg_proc_stat *tmp = node;

			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

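/*
 * Look up the stat node for @cg, creating and inserting it if necessary.
 * On success the node is returned with node->lock held; the caller is
 * responsible for unlocking it.
 */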
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

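/*
 * Move as much of the accumulated surplus time as fits onto @counter,
 * bounded by the per-CPU threshold and by the idle time still available
 * on this CPU.
 */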
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	to_add = free_space > *surplus ? *surplus : free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

static unsigned long diff_cpu_usage(struct cpuacct_usage *older,
				    struct cpuacct_usage *newer,
				    struct cpuacct_usage *diff, int cpu_count)
{
	unsigned long sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is returned
 * through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */

	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	if (sscanf(str, "%"PRId64, value) != 1)
		return false;

	return true;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

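/*
 * For example, with cpu.cfs_quota_us = 150000 and cpu.cfs_period_us = 100000,
 * exact_cpu_count() yields 1.5 visible CPUs, while max_cpu_count() below
 * rounds up to 2.
 */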
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, the number of CPUs in the cpuset (if any)
 * is returned; otherwise zero.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset limit. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

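/*
 * Render a container-scoped /proc/stat: collapse the host's per-CPU lines
 * into at most max_cpu_count(cg) virtual CPUs, distributing surplus user and
 * system time from the hidden CPUs across the visible ones.
 */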
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	/* snprintf() returns an int that may be negative, so keep l signed */
	ssize_t l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;
	int nprocs = get_nprocs_conf();

	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		/* stat_node was returned locked; drop the lock before bailing out */
		pthread_mutex_unlock(&stat_node->lock);
		return 0;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 * (Checking the first online CPU is sufficient.)
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* revise cpu usage view to support partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/*
	 * Render the file. The stat node is still locked here, so every exit
	 * path below must release node->lock first.
	 */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Failed to write cache");
	}
	if (l >= buf_size) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Write to cache was truncated");
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Failed to write cache");
		}
		if (l >= buf_size) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Write to cache was truncated");
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Failed to write cache");
	}
	if (l >= buf_size) {
		pthread_mutex_unlock(&stat_node->lock);
		return log_error(0, "Write to cache was truncated");
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Failed to write cache");
		}
		if (l >= buf_size) {
			pthread_mutex_unlock(&stat_node->lock);
			return log_error(0, "Write to cache was truncated");
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;
	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) != 1)
		return false;

	return cpu_in_cpuset(cpu, cpuset);
}

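/*
 * FUSE read handler for /proc/cpuinfo: emit only the processors visible to
 * the container (per its cpuset and, when CFS quotas are honoured, its CPU
 * quota), renumbering them from 0.
 */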
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view = false;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int cpucount = get_nprocs_conf();
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_v("%s\n",
			"read_cpuacct_usage_all failed to determine number of clock ticks in a second");
		return -1;
	}

	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *data = NULL;
		size_t sz = 0, asz = 0;

		/* read cpuacct.usage_percpu instead. */
		lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;
		lxcfs_v("usage_str: %s\n", usage_str);

		/* convert cpuacct.usage_percpu into cpuacct.usage_all. */
		lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");

		must_strcat(&data, &sz, &asz, "cpu user system\n");

		while (sscanf(usage_str + read_pos, "%" PRIu64 " %n", &cg_user, &read_cnt) > 0) {
			lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
			must_strcat(&data, &sz, &asz, "%d %" PRIu64 " 0\n", i, cg_user);
			i++;
			read_pos += read_cnt;
		}

		/* Switch to the converted buffer and restart parsing from its start. */
		free(usage_str);
		usage_str = data;
		read_pos = 0;

		lxcfs_v("usage_str: %s\n", usage_str);
	}

	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
		return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

	read_pos += read_cnt;

	for (i = 0, j = 0; i < cpucount; i++) {
		ret = sscanf(usage_str + read_pos,
			     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
			     &cg_user, &cg_system, &read_cnt);
		if (ret == EOF)
			break;

		if (ret != 3)
			return log_error(-1, "read_cpuacct_usage_all reading from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

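/* Allocate and initialize a single hash bucket head for the stat history. */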
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head))
		return log_error(false, "%s", strerror(errno));

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		free_disarm(*head);
		return log_error(false, "Failed to initialize list lock");
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}