/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view; /* Usage stats reported to the container. */
	int cpu_count;
	pthread_mutex_t lock; /* For node manipulation. */
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
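
/*
 * Illustrative sketch (not part of the original source): per-cgroup stat
 * nodes live in a fixed-size hash table of linked lists. A lookup for a
 * hypothetical cgroup path "/lxc/c1" proceeds roughly as:
 *
 *	int hash = calc_hash("/lxc/c1") % CPUVIEW_HASH_SIZE;
 *	struct cg_proc_stat_head *head = proc_stat_history[hash];
 *	// walk head->next and strcmp() each node->cg against "/lxc/c1"
 *
 * Collisions are resolved by chaining through the ->next pointer.
 */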

static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
	struct cg_proc_stat *rv = new_node;
	int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *cur;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

	cur = head->next;

	for (;;) {
		/*
		 * The node to be added is already present in the list, so
		 * free the newly allocated one and return the one we found.
		 */
		if (strcmp(cur->cg, new->cg) == 0) {
			rv = cur;
			goto out_rwlock_unlock;
		}

		/* Keep walking. */
		if (cur->next) {
			cur = cur->next;
			continue;
		}

		/* Add new node to end of list. */
		cur->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

out_rwlock_unlock:
	pthread_rwlock_unlock(&head->lock);
	return move_ptr(rv);
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

static bool cgroup_supports(const char *controller, const char *cgroup,
			    const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return faccessat(cfd, path, F_OK, 0) == 0;
}

static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
			call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			lxcfs_debug("Removing stat node for %s\n", cur->cg);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	if (free_space > *surplus)
		to_add = *surplus;
	else
		to_add = free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}
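
/*
 * Worked example for add_cpu_usage() (hypothetical numbers, not from the
 * original source): with threshold = 100, usage->user = 30,
 * usage->system = 20 and usage->idle = 40, free_space starts as
 * 100 - 30 - 20 = 50 and is then capped at the 40 available idle ticks.
 * With *surplus = 25, to_add = 25: the target counter gains 25 ticks,
 * idle drops to 15, and the surplus is fully consumed.
 */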

static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is
 * returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
	bool first = true;
	int ret;

	if (pure_unified_layout(cgroup_ops)) {
		first = !strcmp(param, "quota");
		ret = snprintf(file, sizeof(file), "cpu.max");
	} else {
		ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	}
	if (ret < 0 || (size_t)ret >= sizeof(file))
		return false;

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
}
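
/*
 * Illustrative note (not part of the original source): on legacy cgroup v1
 * hierarchies the quota and period live in separate files, e.g.
 *
 *	cpu.cfs_quota_us  -> "150000"
 *	cpu.cfs_period_us -> "100000"
 *
 * while on a pure unified (cgroup2) layout both live in `cpu.max`, e.g.
 * "150000 100000", which is why the sscanf() above picks either the first
 * or the second field of the same file.
 */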

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	/* Initialize so a failed read counts as "no quota set". */
	int64_t cfs_quota = 0, cfs_period = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}
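
/*
 * Illustrative example (numbers are hypothetical): with
 * cpu.cfs_quota_us = 150000 and cpu.cfs_period_us = 100000,
 * exact_cpu_count() yields 150000 / 100000 = 1.5 visible CPUs,
 * clamped to get_nprocs() if the host has fewer cores than that.
 */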

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	/* Initialize so a failed read counts as "no quota set". */
	int64_t cfs_quota = 0, cfs_period = 0;
	int nr_cpus_in_cpuset = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}
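
/*
 * Illustrative example (hypothetical values): quota = 250000 and
 * period = 100000 give 250000 / 100000 = 2 with a remainder, so
 * max_cpu_count() reports 3 CPUs; if the cgroup's cpuset only allows
 * two CPUs, the smaller cpuset value (2) wins.
 */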

int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available in cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		/* Don't leave the node locked on the early return. */
		pthread_mutex_unlock(&stat_node->lock);
		return 0;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;
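		/*
		 * Illustrative note (hypothetical numbers, not from the
		 * original source): with total_sum = 1000 ticks over
		 * cpu_cnt = 4 host CPUs and max_cpus = 2 visible CPUs,
		 * threshold = 1000 / 4 * 2 = 500 ticks per visible CPU.
		 * The integer division happens before the multiplication.
		 */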

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* revise cpu usage view to support partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;
	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	return total_len;
}
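
/*
 * Illustrative note (hypothetical numbers, not from the original source):
 * on an 8-core host with a container limited to max_cpus = 2, the loop
 * above emits only the first two "processor" stanzas that fall within the
 * container's cpuset and renumbers them 0 and 1, so the container sees a
 * /proc/cpuinfo consistent with its CPU limits.
 */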

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			i++;
			lxcfs_debug("cpu%d with time %s", i, tok);
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);

			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}
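
/*
 * Illustrative conversion example (hypothetical numbers): with
 * _SC_CLK_TCK = 100 ticks per second, a cpuacct value of
 * 2500000000 ns works out to 2500000000 / 1e9 * 100 = 250 USER_HZ
 * ticks, which matches the unit /proc/stat reports in.
 */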

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head))
		return log_error(false, "%s", strerror(errno));

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		free_disarm(*head);
		return log_error(false, "Failed to initialize list lock");
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node;

	if (head->next) {
		node = head->next;

		for (;;) {
			struct cg_proc_stat *cur = node;
			node = node->next;
			free_proc_stat_node(cur);
			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}
1144
4ec5c9da 1145void free_cpuview(void)
1f5596dd 1146{
4ec5c9da 1147 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1f5596dd
CB
1148 if (proc_stat_history[i])
1149 cpuview_free_head(proc_stat_history[i]);
1f5596dd 1150}