]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/proc_cpuview.c
tree-wide: set _GNU_SOURCE in meson.build
[mirror_lxcfs.git] / src / proc_cpuview.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
1f5596dd 2
f834b6bf
SP
3#include "config.h"
4
1f5596dd
CB
5#include <dirent.h>
6#include <errno.h>
7#include <fcntl.h>
1f5596dd
CB
8#include <inttypes.h>
9#include <libgen.h>
10#include <pthread.h>
11#include <sched.h>
12#include <stdarg.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <time.h>
19#include <unistd.h>
20#include <wait.h>
21#include <linux/magic.h>
22#include <linux/sched.h>
23#include <sys/epoll.h>
24#include <sys/mman.h>
25#include <sys/mount.h>
26#include <sys/param.h>
27#include <sys/socket.h>
28#include <sys/syscall.h>
29#include <sys/sysinfo.h>
30#include <sys/vfs.h>
31
e01afbb7
CB
32#include "proc_cpuview.h"
33
1f5596dd 34#include "bindings.h"
1f5596dd
CB
35#include "cgroup_fuse.h"
36#include "cpuset_parse.h"
37#include "cgroups/cgroup.h"
38#include "cgroups/cgroup_utils.h"
39#include "memory_utils.h"
4ec5c9da 40#include "proc_loadavg.h"
1f5596dd
CB
41#include "utils.h"
42
1f5596dd
CB
/*
 * Per-cgroup CPU accounting record used by the CPU view.
 * Records that hash to the same bucket are chained through `next`.
 */
struct cg_proc_stat {
	char *cg;			/* Cgroup name; the key nodes are looked up by. */
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;			/* Number of entries in `usage` and `view`. */
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;	/* Next node in the same bucket, or NULL. */
};
52
53struct cg_proc_stat_head {
54 struct cg_proc_stat *next;
55 time_t lastcheck;
56
57 /*
58 * For access to the list. Reading can be parallel, pruning is exclusive.
59 */
60 pthread_rwlock_t lock;
61};
62
63#define CPUVIEW_HASH_SIZE 100
64static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
65
b456d40d
CB
66static void reset_proc_stat_node(struct cg_proc_stat *node,
67 struct cpuacct_usage *usage, int cpu_count)
1f5596dd 68{
1f5596dd
CB
69 lxcfs_debug("Resetting stat node for %s\n", node->cg);
70 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
71
b456d40d 72 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
73 node->view[i].user = 0;
74 node->view[i].system = 0;
75 node->view[i].idle = 0;
76 }
77
78 node->cpu_count = cpu_count;
79}
80
81static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
82{
83 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
84
85 /* Allocate new memory */
82d74a95 86 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
87 if (!new_usage)
88 return false;
89
82d74a95 90 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd
CB
91 if (!new_view)
92 return false;
93
94 /* Copy existing data & initialize new elements */
95 for (int i = 0; i < cpu_count; i++) {
96 if (i < node->cpu_count) {
82d74a95
CB
97 new_usage[i].user = node->usage[i].user;
98 new_usage[i].system = node->usage[i].system;
99 new_usage[i].idle = node->usage[i].idle;
100
101 new_view[i].user = node->view[i].user;
102 new_view[i].system = node->view[i].system;
103 new_view[i].idle = node->view[i].idle;
1f5596dd
CB
104 }
105 }
106
107 free(node->usage);
108 node->usage = move_ptr(new_usage);
109
110 free(node->view);
111 node->view = move_ptr(new_view);
112 node->cpu_count = cpu_count;
113
114 return true;
115}
116
4ec5c9da
CB
117static void free_proc_stat_node(struct cg_proc_stat *node)
118{
6a4dceb1
CB
119 if (node) {
120 /*
121 * We're abusing the usage pointer to indicate that
122 * pthread_mutex_init() was successful. Don't judge me.
123 */
124 if (node->usage)
125 pthread_mutex_destroy(&node->lock);
126 free_disarm(node->cg);
127 free_disarm(node->usage);
128 free_disarm(node->view);
129 free_disarm(node);
130 }
4ec5c9da
CB
131}
132
6a4dceb1
CB
133define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
134
1f5596dd
CB
135static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
136{
0d129671
CB
137 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
138 struct cg_proc_stat *rv = new_node;
139 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
1f5596dd 140 struct cg_proc_stat_head *head = proc_stat_history[hash];
0d129671 141 struct cg_proc_stat *cur;
1f5596dd
CB
142
143 pthread_rwlock_wrlock(&head->lock);
144
145 if (!head->next) {
0d129671 146 head->next = move_ptr(new);
164acda7 147 goto out_rwlock_unlock;
1f5596dd
CB
148 }
149
0d129671 150 cur = head->next;
1f5596dd
CB
151
152 for (;;) {
0d129671
CB
153 /*
154 * The node to be added is already present in the list, so
155 * free the newly allocated one and return the one we found.
156 */
157 if (strcmp(cur->cg, new->cg) == 0) {
158 rv = cur;
164acda7 159 goto out_rwlock_unlock;
1f5596dd
CB
160 }
161
0d129671
CB
162 /* Keep walking. */
163 if (cur->next) {
164 cur = cur->next;
1f5596dd
CB
165 continue;
166 }
167
0d129671
CB
168 /* Add new node to end of list. */
169 cur->next = move_ptr(new);
164acda7 170 goto out_rwlock_unlock;
1f5596dd
CB
171 }
172
164acda7 173out_rwlock_unlock:
1f5596dd 174 pthread_rwlock_unlock(&head->lock);
0d129671 175 return move_ptr(rv);
1f5596dd
CB
176}
177
6a4dceb1
CB
178static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
179 int cpu_count, const char *cg)
1f5596dd 180{
6a4dceb1
CB
181 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
182 __do_free struct cpuacct_usage *new_usage = NULL;
1f5596dd 183
6a4dceb1 184 node = zalloc(sizeof(struct cg_proc_stat));
1f5596dd 185 if (!node)
6a4dceb1 186 return NULL;
1f5596dd 187
6a4dceb1 188 node->cg = strdup(cg);
1f5596dd 189 if (!node->cg)
6a4dceb1 190 return NULL;
1f5596dd 191
6a4dceb1
CB
192 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
193 if (!new_usage)
194 return NULL;
1f5596dd 195
6a4dceb1 196 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
1f5596dd 197 if (!node->view)
6a4dceb1 198 return NULL;
1f5596dd
CB
199
200 node->cpu_count = cpu_count;
1f5596dd 201
6a4dceb1
CB
202 if (pthread_mutex_init(&node->lock, NULL))
203 return NULL;
204 /*
205 * We're abusing the usage pointer to indicate that
206 * pthread_mutex_init() was successful. Don't judge me.
207 */
208 node->usage = move_ptr(new_usage);
1f5596dd 209
6a4dceb1 210 return move_ptr(node);
1f5596dd
CB
211}
212
2d00d04c
CB
213static bool cgroup_supports(const char *controller, const char *cgroup,
214 const char *file)
4ec5c9da 215{
2c990b1d
CB
216 __do_free char *path = NULL;
217 int cfd;
4ec5c9da
CB
218
219 cfd = get_cgroup_fd(controller);
220 if (cfd < 0)
221 return false;
222
925d5849 223 path = must_make_path_relative(cgroup, file, NULL);
2d00d04c 224 return faccessat(cfd, path, F_OK, 0) == 0;
4ec5c9da
CB
225}
226
1f5596dd
CB
227static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
228{
b456d40d 229 struct cg_proc_stat *first = NULL;
1f5596dd 230
b456d40d 231 for (struct cg_proc_stat *prev = NULL; node; ) {
2d00d04c 232 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
d5e34313 233 struct cg_proc_stat *cur = node;
1f5596dd
CB
234
235 if (prev)
236 prev->next = node->next;
237 else
238 first = node->next;
239
240 node = node->next;
d5e34313
CB
241 lxcfs_debug("Removing stat node for %s\n", cur);
242
243 free_proc_stat_node(cur);
1f5596dd
CB
244 } else {
245 if (!first)
246 first = node;
247 prev = node;
248 node = node->next;
249 }
250 }
251
252 return first;
253}
254
255#define PROC_STAT_PRUNE_INTERVAL 10
256static void prune_proc_stat_history(void)
257{
1f5596dd
CB
258 time_t now = time(NULL);
259
b456d40d 260 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1f5596dd
CB
261 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
262
263 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
264 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
265 return;
266 }
267
268 if (proc_stat_history[i]->next) {
269 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
270 proc_stat_history[i]->lastcheck = now;
271 }
272
273 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
274 }
275}
276
277static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
278 const char *cg)
279{
280 struct cg_proc_stat *node;
281
282 pthread_rwlock_rdlock(&head->lock);
283
284 if (!head->next) {
285 pthread_rwlock_unlock(&head->lock);
286 return NULL;
287 }
288
289 node = head->next;
290
291 do {
292 if (strcmp(cg, node->cg) == 0)
293 goto out;
294 } while ((node = node->next));
295
296 node = NULL;
297
298out:
299 pthread_rwlock_unlock(&head->lock);
300 prune_proc_stat_history();
301 return node;
302}
303
692f48eb
CB
304static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
305 int cpu_count, const char *cg)
1f5596dd
CB
306{
307 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
308 struct cg_proc_stat_head *head = proc_stat_history[hash];
309 struct cg_proc_stat *node;
310
311 node = find_proc_stat_node(head, cg);
1f5596dd
CB
312 if (!node) {
313 node = new_proc_stat_node(usage, cpu_count, cg);
314 if (!node)
315 return NULL;
316
317 node = add_proc_stat_node(node);
318 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
319 }
320
321 pthread_mutex_lock(&node->lock);
322
ce089f10
CB
323 /*
324 * If additional CPUs on the host have been enabled, CPU usage counter
325 * arrays have to be expanded.
326 */
1f5596dd
CB
327 if (node->cpu_count < cpu_count) {
328 lxcfs_debug("Expanding stat node %d->%d for %s\n",
ce089f10 329 node->cpu_count, cpu_count, cg);
1f5596dd
CB
330
331 if (!expand_proc_stat_node(node, cpu_count)) {
332 pthread_mutex_unlock(&node->lock);
b456d40d 333 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
1f5596dd
CB
334 }
335 }
336
337 return node;
338}
339
2b8eff1d
CB
340static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
341 uint64_t *counter, uint64_t threshold)
1f5596dd 342{
1ba088ae 343 uint64_t free_space, to_add;
1f5596dd
CB
344
345 free_space = threshold - usage->user - usage->system;
346
347 if (free_space > usage->idle)
348 free_space = usage->idle;
349
8206874a
CB
350 if (free_space > *surplus)
351 to_add = *surplus;
352 else
353 to_add = free_space;
1f5596dd
CB
354
355 *counter += to_add;
356 usage->idle -= to_add;
357 *surplus -= to_add;
358}
359
1ba088ae
CB
360static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
361 struct cpuacct_usage *newer,
362 struct cpuacct_usage *diff, int cpu_count)
1f5596dd 363{
1ba088ae 364 uint64_t sum = 0;
1f5596dd 365
b456d40d 366 for (int i = 0; i < cpu_count; i++) {
1f5596dd
CB
367 if (!newer[i].online)
368 continue;
369
b456d40d
CB
370 /*
371 * When cpuset is changed on the fly, the CPUs might get
372 * reordered. We could either reset all counters, or check
373 * that the substractions below will return expected results.
1f5596dd
CB
374 */
375 if (newer[i].user > older[i].user)
376 diff[i].user = newer[i].user - older[i].user;
377 else
378 diff[i].user = 0;
379
380 if (newer[i].system > older[i].system)
381 diff[i].system = newer[i].system - older[i].system;
382 else
383 diff[i].system = 0;
384
385 if (newer[i].idle > older[i].idle)
386 diff[i].idle = newer[i].idle - older[i].idle;
387 else
388 diff[i].idle = 0;
389
390 sum += diff[i].user;
391 sum += diff[i].system;
392 sum += diff[i].idle;
393 }
394
395 return sum;
396}
397
398/*
b456d40d
CB
399 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
400 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
92264841 401 * through `value`.
1f5596dd
CB
402 */
403static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
404{
405 __do_free char *str = NULL;
48f6862e 406 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
9844eea7 407 bool first = true;
48f6862e 408 int ret;
1f5596dd 409
48f6862e 410 if (pure_unified_layout(cgroup_ops)) {
9844eea7 411 first = !strcmp(param, "quota");
48f6862e
CB
412 ret = snprintf(file, sizeof(file), "cpu.max");
413 } else {
414 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
9844eea7 415 }
48f6862e 416 if (ret < 0 || (size_t)ret >= sizeof(file))
1f5596dd
CB
417 return false;
418
48f6862e 419 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
1f5596dd
CB
420 return false;
421
48f6862e 422 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
1f5596dd
CB
423}
424
/*
 * Return the exact (possibly fractional) number of CPUs granted to @cg by
 * its CFS quota, i.e. quota / period, capped at get_nprocs().
 *
 * Zero is returned when either parameter cannot be read or is not positive.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t cfs_quota, cfs_period;
	double ratio;
	int nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota) ||
	    !read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	ratio = (double)cfs_quota / (double)cfs_period;
	nprocs = get_nprocs();

	return ratio > nprocs ? nprocs : ratio;
}
453
454/*
455 * Return the maximum number of visible CPUs based on CPU quotas.
456 * If there is no quota set, zero is returned.
457 */
4ec5c9da 458int max_cpu_count(const char *cg)
1f5596dd 459{
700dd417 460 __do_free char *cpuset = NULL;
1f5596dd
CB
461 int rv, nprocs;
462 int64_t cfs_quota, cfs_period;
463 int nr_cpus_in_cpuset = 0;
1f5596dd 464
921bdfdb
CB
465 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
466 return 0;
467
468 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
469 return 0;
1f5596dd
CB
470
471 cpuset = get_cpuset(cg);
472 if (cpuset)
473 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
474
921bdfdb 475 if (cfs_quota <= 0 || cfs_period <= 0) {
1f5596dd
CB
476 if (nr_cpus_in_cpuset > 0)
477 return nr_cpus_in_cpuset;
478
479 return 0;
480 }
481
482 rv = cfs_quota / cfs_period;
483
921bdfdb
CB
484 /*
485 * In case quota/period does not yield a whole number, add one CPU for
1f5596dd
CB
486 * the remainder.
487 */
488 if ((cfs_quota % cfs_period) > 0)
489 rv += 1;
490
491 nprocs = get_nprocs();
1f5596dd
CB
492 if (rv > nprocs)
493 rv = nprocs;
494
921bdfdb 495 /* Use min value in cpu quota and cpuset. */
1f5596dd
CB
496 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
497 rv = nr_cpus_in_cpuset;
498
499 return rv;
500}
501
502int cpuview_proc_stat(const char *cg, const char *cpuset,
503 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
504 FILE *f, char *buf, size_t buf_size)
505{
506 __do_free char *line = NULL;
507 __do_free struct cpuacct_usage *diff = NULL;
4f18a602 508 size_t linelen = 0, total_len = 0;
1f5596dd
CB
509 int curcpu = -1; /* cpu numbering starts at 0 */
510 int physcpu, i;
39f231da 511 int cpu_cnt = 0;
2b8eff1d
CB
512 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
513 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
514 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
515 uint64_t user_surplus = 0, system_surplus = 0;
39f231da 516 int nprocs, max_cpus;
4f18a602 517 ssize_t l;
2b8eff1d 518 uint64_t total_sum, threshold;
1f5596dd 519 struct cg_proc_stat *stat_node;
1f5596dd 520
39f231da 521 nprocs = get_nprocs_conf();
1f5596dd
CB
522 if (cg_cpu_usage_size < nprocs)
523 nprocs = cg_cpu_usage_size;
524
525 /* Read all CPU stats and stop when we've encountered other lines */
526 while (getline(&line, &linelen, f) != -1) {
527 int ret;
528 char cpu_char[10]; /* That's a lot of cores */
529 uint64_t all_used, cg_used;
530
531 if (strlen(line) == 0)
532 continue;
533
534 /* not a ^cpuN line containing a number N */
535 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
536 break;
537
538 if (sscanf(cpu_char, "%d", &physcpu) != 1)
539 continue;
540
541 if (physcpu >= cg_cpu_usage_size)
542 continue;
543
fd65c77c
CB
544 curcpu++;
545 cpu_cnt++;
1f5596dd
CB
546
547 if (!cpu_in_cpuset(physcpu, cpuset)) {
548 for (i = curcpu; i <= physcpu; i++)
549 cg_cpu_usage[i].online = false;
550 continue;
551 }
552
553 if (curcpu < physcpu) {
554 /* Some CPUs may be disabled */
555 for (i = curcpu; i < physcpu; i++)
556 cg_cpu_usage[i].online = false;
557
558 curcpu = physcpu;
559 }
560
561 cg_cpu_usage[curcpu].online = true;
562
2b8eff1d 563 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu",
1f5596dd
CB
564 &user,
565 &nice,
566 &system,
567 &idle,
568 &iowait,
569 &irq,
570 &softirq,
571 &steal,
572 &guest,
573 &guest_nice);
1f5596dd
CB
574 if (ret != 10)
575 continue;
576
577 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
578 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
579
580 if (all_used >= cg_used) {
581 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
1f5596dd 582 } else {
cc49667a
CB
583 lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
584 curcpu, cg, all_used, cg_used);
1f5596dd
CB
585 cg_cpu_usage[curcpu].idle = idle;
586 }
587 }
588
f9434b9a
CB
589 /* Cannot use more CPUs than is available in cpuset. */
590 max_cpus = max_cpu_count(cg);
591 if (max_cpus > cpu_cnt || !max_cpus)
592 max_cpus = cpu_cnt;
593
692f48eb 594 /* takes lock pthread_mutex_lock(&node->lock) */
1f5596dd 595 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
b456d40d
CB
596 if (!stat_node)
597 return log_error(0, "Failed to find/create stat node for %s", cg);
1f5596dd 598
b4572722 599 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
700dd417 600 if (!diff)
08d61303 601 goto out_pthread_mutex_unlock;
1f5596dd
CB
602
603 /*
604 * If the new values are LOWER than values stored in memory, it means
605 * the cgroup has been reset/recreated and we should reset too.
606 */
607 for (curcpu = 0; curcpu < nprocs; curcpu++) {
608 if (!cg_cpu_usage[curcpu].online)
609 continue;
610
611 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
612 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
613
614 break;
615 }
616
617 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
618
619 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
620 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
621
622 if (!stat_node->usage[curcpu].online)
623 continue;
624
625 i++;
626
b4572722 627 stat_node->usage[curcpu].user += diff[curcpu].user;
1f5596dd 628 stat_node->usage[curcpu].system += diff[curcpu].system;
b4572722 629 stat_node->usage[curcpu].idle += diff[curcpu].idle;
1f5596dd
CB
630
631 if (max_cpus > 0 && i >= max_cpus) {
b4572722
CB
632 user_surplus += diff[curcpu].user;
633 system_surplus += diff[curcpu].system;
1f5596dd
CB
634 }
635 }
636
637 /* Calculate usage counters of visible CPUs */
638 if (max_cpus > 0) {
2b8eff1d
CB
639 uint64_t diff_user = 0;
640 uint64_t diff_system = 0;
641 uint64_t diff_idle = 0;
642 uint64_t max_diff_idle = 0;
643 uint64_t max_diff_idle_index = 0;
1f5596dd 644 double exact_cpus;
1f5596dd
CB
645 /* threshold = maximum usage per cpu, including idle */
646 threshold = total_sum / cpu_cnt * max_cpus;
647
648 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
649 if (!stat_node->usage[curcpu].online)
650 continue;
651
652 i++;
653
654 if (i == max_cpus)
655 break;
656
657 if (diff[curcpu].user + diff[curcpu].system >= threshold)
658 continue;
659
660 /* Add user */
661 add_cpu_usage(&user_surplus, &diff[curcpu],
662 &diff[curcpu].user, threshold);
663
664 if (diff[curcpu].user + diff[curcpu].system >= threshold)
665 continue;
666
667 /* If there is still room, add system */
668 add_cpu_usage(&system_surplus, &diff[curcpu],
669 &diff[curcpu].system, threshold);
670 }
671
672 if (user_surplus > 0)
673 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
674 if (system_surplus > 0)
675 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
676
677 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
678 if (!stat_node->usage[curcpu].online)
679 continue;
680
681 i++;
682
683 if (i == max_cpus)
684 break;
685
b4572722
CB
686 stat_node->view[curcpu].user += diff[curcpu].user;
687 stat_node->view[curcpu].system += diff[curcpu].system;
688 stat_node->view[curcpu].idle += diff[curcpu].idle;
1f5596dd 689
b4572722
CB
690 user_sum += stat_node->view[curcpu].user;
691 system_sum += stat_node->view[curcpu].system;
692 idle_sum += stat_node->view[curcpu].idle;
1f5596dd 693
b4572722
CB
694 diff_user += diff[curcpu].user;
695 diff_system += diff[curcpu].system;
696 diff_idle += diff[curcpu].idle;
1f5596dd 697 if (diff[curcpu].idle > max_diff_idle) {
b4572722
CB
698 max_diff_idle = diff[curcpu].idle;
699 max_diff_idle_index = curcpu;
1f5596dd
CB
700 }
701
702 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
703 }
704 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
705
706 /* revise cpu usage view to support partial cpu case. */
707 exact_cpus = exact_cpu_count(cg);
708 if (exact_cpus < (double)max_cpus){
1ba088ae 709 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
1f5596dd
CB
710
711 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
712 lxcfs_v("delta: %lu\n", delta);
713 lxcfs_v("idle_sum before: %lu\n", idle_sum);
b4572722
CB
714 if (idle_sum > delta)
715 idle_sum = idle_sum - delta;
716 else
717 idle_sum = 0;
1f5596dd
CB
718 lxcfs_v("idle_sum after: %lu\n", idle_sum);
719
720 curcpu = max_diff_idle_index;
721 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
b4572722
CB
722 if (stat_node->view[curcpu].idle > delta)
723 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
724 else
725 stat_node->view[curcpu].idle = 0;
1f5596dd
CB
726 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
727 }
728 } else {
729 for (curcpu = 0; curcpu < nprocs; curcpu++) {
730 if (!stat_node->usage[curcpu].online)
731 continue;
732
b4572722
CB
733 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
734 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
735 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
1f5596dd 736
b4572722
CB
737 user_sum += stat_node->view[curcpu].user;
738 system_sum += stat_node->view[curcpu].system;
739 idle_sum += stat_node->view[curcpu].idle;
1f5596dd
CB
740 }
741 }
742
743 /* Render the file */
744 /* cpu-all */
2b8eff1d
CB
745 l = snprintf(buf, buf_size,
746 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
747 user_sum, system_sum, idle_sum);
1f5596dd 748 lxcfs_v("cpu-all: %s\n", buf);
692f48eb
CB
749 if (l < 0) {
750 lxcfs_error("Failed to write cache");
751 total_len = 0;
752 goto out_pthread_mutex_unlock;
753 }
3cf1e562 754 if ((size_t)l >= buf_size) {
08d61303
CB
755 lxcfs_error("Write to cache was truncated");
756 total_len = 0;
757 goto out_pthread_mutex_unlock;
758 }
1f5596dd
CB
759
760 buf += l;
761 buf_size -= l;
762 total_len += l;
763
764 /* Render visible CPUs */
765 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
766 if (!stat_node->usage[curcpu].online)
767 continue;
768
769 i++;
770
771 if (max_cpus > 0 && i == max_cpus)
772 break;
773
2b8eff1d
CB
774 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
775 i,
776 stat_node->view[curcpu].user,
777 stat_node->view[curcpu].system,
778 stat_node->view[curcpu].idle);
1f5596dd 779 lxcfs_v("cpu: %s\n", buf);
692f48eb
CB
780 if (l < 0) {
781 lxcfs_error("Failed to write cache");
782 total_len = 0;
783 goto out_pthread_mutex_unlock;
784 }
3cf1e562 785 if ((size_t)l >= buf_size) {
692f48eb
CB
786 lxcfs_error("Write to cache was truncated");
787 total_len = 0;
788 goto out_pthread_mutex_unlock;
789 }
1f5596dd
CB
790
791 buf += l;
792 buf_size -= l;
793 total_len += l;
794 }
795
796 /* Pass the rest of /proc/stat, start with the last line read */
797 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
798 if (l < 0) {
799 lxcfs_error("Failed to write cache");
800 total_len = 0;
801 goto out_pthread_mutex_unlock;
802 }
3cf1e562 803 if ((size_t)l >= buf_size) {
692f48eb
CB
804 lxcfs_error("Write to cache was truncated");
805 total_len = 0;
806 goto out_pthread_mutex_unlock;
807 }
1f5596dd
CB
808
809 buf += l;
810 buf_size -= l;
811 total_len += l;
812
813 /* Pass the rest of the host's /proc/stat */
814 while (getline(&line, &linelen, f) != -1) {
815 l = snprintf(buf, buf_size, "%s", line);
692f48eb
CB
816 if (l < 0) {
817 lxcfs_error("Failed to write cache");
818 total_len = 0;
819 goto out_pthread_mutex_unlock;
820 }
3cf1e562 821 if ((size_t)l >= buf_size) {
692f48eb
CB
822 lxcfs_error("Write to cache was truncated");
823 total_len = 0;
824 goto out_pthread_mutex_unlock;
825 }
b456d40d 826
1f5596dd
CB
827 buf += l;
828 buf_size -= l;
829 total_len += l;
830 }
831
692f48eb 832out_pthread_mutex_unlock:
1f5596dd
CB
833 if (stat_node)
834 pthread_mutex_unlock(&stat_node->lock);
b456d40d 835
1f5596dd
CB
836 return total_len;
837}
838
/*
 * Tell whether @line is a "processor : N" line of /proc/cpuinfo, i.e.
 * starts with "processor", a colon and a decimal CPU number.
 */
static inline bool is_processor_line(const char *line)
{
	int ignored;

	/* Only the match matters; the parsed number is discarded. */
	return sscanf(line, "processor : %d", &ignored) == 1;
}
847
/*
 * Tell whether @line is a "processor : N" line whose CPU number N is part
 * of @cpuset. Lines that do not parse as such yield false.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
857
858int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
859 struct fuse_file_info *fi)
860{
861 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
757a63e7 862 __do_free void *fopen_cache = NULL;
1f5596dd
CB
863 __do_fclose FILE *f = NULL;
864 struct fuse_context *fc = fuse_get_context();
0274438c 865 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
99b183fb 866 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1f5596dd
CB
867 size_t linelen = 0, total_len = 0;
868 bool am_printing = false, firstline = true, is_s390x = false;
869 int curcpu = -1, cpu, max_cpus = 0;
870 bool use_view;
871 char *cache = d->buf;
872 size_t cache_size = d->buflen;
873
f9434b9a 874 if (offset) {
3cf1e562 875 size_t left;
1f5596dd
CB
876
877 if (offset > d->size)
878 return -EINVAL;
879
880 if (!d->cached)
881 return 0;
882
883 left = d->size - offset;
884 total_len = left > size ? size: left;
885 memcpy(buf, cache + offset, total_len);
886
887 return total_len;
888 }
889
890 pid_t initpid = lookup_initpid_in_store(fc->pid);
891 if (initpid <= 1 || is_shared_pidns(initpid))
892 initpid = fc->pid;
b456d40d 893
1f5596dd
CB
894 cg = get_pid_cgroup(initpid, "cpuset");
895 if (!cg)
896 return read_file_fuse("proc/cpuinfo", buf, size, d);
897 prune_init_slice(cg);
898
899 cpuset = get_cpuset(cg);
900 if (!cpuset)
901 return 0;
902
8044f626 903 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
0274438c 904 use_view = true;
8044f626
CB
905 else
906 use_view = false;
1f5596dd
CB
907 if (use_view)
908 max_cpus = max_cpu_count(cg);
909
757a63e7 910 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
1f5596dd
CB
911 if (!f)
912 return 0;
913
914 while (getline(&line, &linelen, f) != -1) {
915 ssize_t l;
916 if (firstline) {
917 firstline = false;
918 if (strstr(line, "IBM/S390") != NULL) {
919 is_s390x = true;
920 am_printing = true;
921 continue;
922 }
923 }
b456d40d 924
1f5596dd
CB
925 if (strncmp(line, "# processors:", 12) == 0)
926 continue;
b456d40d 927
1f5596dd 928 if (is_processor_line(line)) {
d0031abf 929 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
1f5596dd 930 break;
b456d40d 931
1f5596dd
CB
932 am_printing = cpuline_in_cpuset(line, cpuset);
933 if (am_printing) {
d0031abf 934 curcpu++;
1f5596dd 935 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
b456d40d
CB
936 if (l < 0)
937 return log_error(0, "Failed to write cache");
3cf1e562 938 if ((size_t)l >= cache_size)
b456d40d 939 return log_error(0, "Write to cache was truncated");
1f5596dd
CB
940 cache += l;
941 cache_size -= l;
942 total_len += l;
943 }
944 continue;
945 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
946 char *p;
b456d40d 947
d0031abf 948 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
1f5596dd 949 break;
b456d40d 950
1f5596dd
CB
951 if (!cpu_in_cpuset(cpu, cpuset))
952 continue;
b456d40d 953
1f5596dd
CB
954 curcpu ++;
955 p = strchr(line, ':');
956 if (!p || !*p)
957 return 0;
958 p++;
b456d40d 959
1f5596dd 960 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
b456d40d
CB
961 if (l < 0)
962 return log_error(0, "Failed to write cache");
3cf1e562 963 if ((size_t)l >= cache_size)
b456d40d
CB
964 return log_error(0, "Write to cache was truncated");
965
1f5596dd
CB
966 cache += l;
967 cache_size -= l;
968 total_len += l;
969 continue;
970
971 }
972 if (am_printing) {
973 l = snprintf(cache, cache_size, "%s", line);
b456d40d
CB
974 if (l < 0)
975 return log_error(0, "Failed to write cache");
3cf1e562 976 if ((size_t)l >= cache_size)
b456d40d
CB
977 return log_error(0, "Write to cache was truncated");
978
1f5596dd
CB
979 cache += l;
980 cache_size -= l;
981 total_len += l;
982 }
983 }
984
985 if (is_s390x) {
986 __do_free char *origcache = d->buf;
987 ssize_t l;
988
989 d->buf = malloc(d->buflen);
990 if (!d->buf) {
991 d->buf = move_ptr(origcache);
992 return 0;
993 }
994
995 cache = d->buf;
996 cache_size = d->buflen;
997 total_len = 0;
998 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3cf1e562 999 if (l < 0 || (size_t)l >= cache_size)
1f5596dd
CB
1000 return 0;
1001
1002 cache_size -= l;
1003 cache += l;
1004 total_len += l;
1005 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3cf1e562 1006 if (l < 0 || (size_t)l >= cache_size)
1f5596dd
CB
1007 return 0;
1008
1009 cache_size -= l;
1010 cache += l;
1011 total_len += l;
1012 l = snprintf(cache, cache_size, "%s", origcache);
3cf1e562 1013 if (l < 0 || (size_t)l >= cache_size)
1f5596dd
CB
1014 return 0;
1015 total_len += l;
1016 }
1017
1018 d->cached = 1;
1019 d->size = total_len;
d0031abf
CB
1020 if (total_len > size)
1021 total_len = size;
1f5596dd
CB
1022
1023 /* read from off 0 */
1024 memcpy(buf, d->buf, total_len);
d0031abf 1025
1f5596dd
CB
1026 return total_len;
1027}
1028
1029/*
1030 * Returns 0 on success.
1031 * It is the caller's responsibility to free `return_usage`, unless this
1032 * function returns an error.
1033 */
1034int read_cpuacct_usage_all(char *cg, char *cpuset,
1035 struct cpuacct_usage **return_usage, int *size)
1036{
1037 __do_free char *usage_str = NULL;
1038 __do_free struct cpuacct_usage *cpu_usage = NULL;
9ce186dc 1039 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
8b6987a2 1040 int cpucount;
9ce186dc 1041 int ret;
1f5596dd
CB
1042 int cg_cpu;
1043 uint64_t cg_user, cg_system;
1044 int64_t ticks_per_sec;
1045
1046 ticks_per_sec = sysconf(_SC_CLK_TCK);
1f5596dd 1047 if (ticks_per_sec < 0 && errno == EINVAL) {
8b6987a2 1048 lxcfs_debug("%m - Failed to determine number of ticks per second");
1f5596dd
CB
1049 return -1;
1050 }
1051
f9434b9a 1052 cpucount = get_nprocs_conf();
1f5596dd
CB
1053 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1054 if (!cpu_usage)
1055 return -ENOMEM;
1056
1057 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1058 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
8b6987a2
CB
1059 char *sep = " \t\n";
1060 char *tok;
1f5596dd 1061
8b6987a2
CB
1062 /* Read cpuacct.usage_percpu instead. */
1063 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1f5596dd
CB
1064 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1065 return -1;
1f5596dd 1066
8b6987a2
CB
1067 lxc_iterate_parts(tok, usage_str, sep) {
1068 uint64_t percpu_user;
1069
1070 if (i >= cpucount)
1071 break;
1f5596dd 1072
8b6987a2
CB
1073 tok = trim_whitespace_in_place(tok);
1074 ret = safe_uint64(tok, &percpu_user, 10);
1075 if (ret)
1076 return -1;
1f5596dd 1077
8b6987a2
CB
1078 /* Convert the time from nanoseconds to USER_HZ */
1079 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1080 cpu_usage[i].system = cpu_usage[i].user;
1f5596dd 1081 i++;
8b6987a2 1082 lxcfs_debug("cpu%d with time %s", i, tok);
1f5596dd 1083 }
8b6987a2
CB
1084 } else {
1085 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1086 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1f5596dd 1087
8b6987a2 1088 read_pos += read_cnt;
1f5596dd 1089
8b6987a2
CB
1090 for (i = 0, j = 0; i < cpucount; i++) {
1091 ret = sscanf(usage_str + read_pos,
1092 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1093 &cg_user, &cg_system, &read_cnt);
1f5596dd 1094
8b6987a2
CB
1095 if (ret == EOF)
1096 break;
1f5596dd 1097
8b6987a2
CB
1098 if (ret != 3)
1099 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1100 usage_str + read_pos, cg);
1f5596dd 1101
8b6987a2 1102 read_pos += read_cnt;
1f5596dd 1103
8b6987a2
CB
1104 /* Convert the time from nanoseconds to USER_HZ */
1105 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1106 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1107 j++;
1108 }
1f5596dd
CB
1109 }
1110
1111 *return_usage = move_ptr(cpu_usage);
1112 *size = cpucount;
1113 return 0;
1114}
1115
1116static bool cpuview_init_head(struct cg_proc_stat_head **head)
1117{
9d7fc1a3 1118 __do_free struct cg_proc_stat_head *h;
1f5596dd 1119
9d7fc1a3
CB
1120 h = zalloc(sizeof(struct cg_proc_stat_head));
1121 if (!h)
1122 return false;
1f5596dd 1123
9d7fc1a3
CB
1124 if (pthread_rwlock_init(&h->lock, NULL))
1125 return false;
1126
1127 h->lastcheck = time(NULL);
1f5596dd 1128
9d7fc1a3 1129 *head = move_ptr(h);
1f5596dd
CB
1130 return true;
1131}
1132
4ec5c9da 1133bool init_cpuview(void)
1f5596dd
CB
1134{
1135 int i;
1136
1137 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1138 proc_stat_history[i] = NULL;
1139
1140 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1141 if (!cpuview_init_head(&proc_stat_history[i]))
1142 goto err;
1143 }
1144
1145 return true;
1146
1147err:
1148 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1149 if (proc_stat_history[i])
1150 free_disarm(proc_stat_history[i]);
1151 }
1152
1153 return false;
1154}
1155
1f5596dd
CB
1156static void cpuview_free_head(struct cg_proc_stat_head *head)
1157{
905769cd 1158 struct cg_proc_stat *node;
1f5596dd
CB
1159
1160 if (head->next) {
1161 node = head->next;
1162
1163 for (;;) {
905769cd 1164 struct cg_proc_stat *cur = node;
1f5596dd 1165 node = node->next;
905769cd 1166 free_proc_stat_node(cur);
1f5596dd
CB
1167 if (!node)
1168 break;
1169 }
1170 }
1171
1172 pthread_rwlock_destroy(&head->lock);
1173 free_disarm(head);
1174}
1175
4ec5c9da 1176void free_cpuview(void)
1f5596dd 1177{
4ec5c9da 1178 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1f5596dd
CB
1179 if (proc_stat_history[i])
1180 cpuview_free_head(proc_stat_history[i]);
1f5596dd 1181}