/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "config.h"

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "proc_cpuview.h"

#include "bindings.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

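/*
 * Reset a node's counters after copying in the current host usage. Used when
 * the tracked cgroup appears to have been recreated, i.e. its usage counters
 * went backwards.
 */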
static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

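/*
 * Grow a node's usage/view arrays to cpu_count entries, e.g. after CPUs have
 * been brought online on the host. Existing per-CPU counters are preserved.
 */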
static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

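/*
 * Insert a freshly allocated node into its hash bucket. If a node for the
 * same cgroup already exists, the new one is freed and the existing node is
 * returned, so callers must use the return value rather than the node they
 * passed in.
 */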
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
	struct cg_proc_stat *rv = new_node;
	int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *cur;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

	cur = head->next;

	for (;;) {
		/*
		 * The node to be added is already present in the list, so
		 * free the newly allocated one and return the one we found.
		 */
		if (strcmp(cur->cg, new->cg) == 0) {
			rv = cur;
			goto out_rwlock_unlock;
		}

		/* Keep walking. */
		if (cur->next) {
			cur = cur->next;
			continue;
		}

		/* Add new node to end of list. */
		cur->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

out_rwlock_unlock:
	pthread_rwlock_unlock(&head->lock);
	return move_ptr(rv);
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

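/*
 * Check whether @file exists for @cgroup by probing relative to the
 * controller's cgroup mount fd. Used below to detect stale nodes whose
 * cgroup has gone away.
 */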
static bool cgroup_supports(const char *controller, const char *cgroup,
			    const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return faccessat(cfd, path, F_OK, 0) == 0;
}

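/*
 * Drop every node in a bucket list whose cgroup no longer exposes
 * cpu.shares, i.e. the cgroup has been removed. Returns the new list head.
 */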
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
			call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			lxcfs_debug("Removing stat node for %s\n", cur->cg);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

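/*
 * Prune stale nodes from the history, at most once every
 * PROC_STAT_PRUNE_INTERVAL seconds.
 */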
#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

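/*
 * Look up the stat node for @cg, creating and inserting it if needed. On
 * success the node is returned with node->lock held; the caller is
 * responsible for unlocking it.
 */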
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
							   int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

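/*
 * Fold part of @surplus into @counter: add as much as still fits below
 * @threshold for this CPU (and no more than its remaining idle time),
 * reducing the CPU's idle time and the surplus accordingly. This is how
 * time spent on CPUs the container cannot see gets redistributed onto the
 * visible CPUs.
 */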
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	if (free_space > *surplus)
		to_add = *surplus;
	else
		to_add = free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

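/*
 * Compute per-CPU deltas between two snapshots into @diff, skipping offline
 * CPUs, and return the sum of all deltas.
 */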
static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read a CPU bandwidth parameter for cgroup @cg and return it through
 * @value. On the legacy layout this reads `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on @param ("quota" or "period"); on a pure
 * unified (cgroup2) layout both values come from the `cpu.max` file.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
	bool first = true;
	int ret;

	if (pure_unified_layout(cgroup_ops)) {
		first = !strcmp(param, "quota");
		ret = snprintf(file, sizeof(file), "cpu.max");
	} else {
		ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	}
	if (ret < 0 || (size_t)ret >= sizeof(file))
		return false;

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

/*
 * Return the maximum number of visible CPUs based on the CPU quota and the
 * cpuset. If neither imposes a limit, zero is returned.
 */
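/*
 * Example (hypothetical values): with cpu.cfs_quota_us = 150000 and
 * cpu.cfs_period_us = 100000, quota / period is 1 with a remainder, so two
 * CPUs are reported; the result is further capped by the number of online
 * host CPUs and by the number of CPUs in the cpuset.
 */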
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the lower of the quota-derived count and the cpuset count. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

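/*
 * Render a container view of /proc/stat:
 *  1. parse the host's per-CPU lines and derive per-CPU idle time for the
 *     cgroup from its cpuacct counters,
 *  2. diff against the cached node for this cgroup and redistribute time
 *     spent on CPUs beyond the quota-derived limit onto the visible CPUs,
 *  3. emit an aggregate "cpu" line plus one "cpuN" line per visible CPU,
 *     then pass the remainder of the host file through unchanged.
 */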
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than are available in the cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	/* Returns the node with node->lock held. */
	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff)
		goto out_pthread_mutex_unlock;

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* revise cpu usage view to support partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			if (idle_sum > delta)
				idle_sum = idle_sum - delta;
			else
				idle_sum = 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			if (stat_node->view[curcpu].idle > delta)
				stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
			else
				stat_node->view[curcpu].idle = 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0) {
		lxcfs_error("Failed to write cache");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}
	if (l >= buf_size) {
		lxcfs_error("Write to cache was truncated");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0) {
			lxcfs_error("Failed to write cache");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}
		if (l >= buf_size) {
			lxcfs_error("Write to cache was truncated");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0) {
		lxcfs_error("Failed to write cache");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}
	if (l >= buf_size) {
		lxcfs_error("Write to cache was truncated");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			lxcfs_error("Failed to write cache");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}
		if (l >= buf_size) {
			lxcfs_error("Write to cache was truncated");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

out_pthread_mutex_unlock:
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a "processor : N" line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;
	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

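/*
 * Render a container view of /proc/cpuinfo: only CPUs in the cgroup's cpuset
 * (and, when CFS quotas are honoured, at most max_cpu_count() CPUs) are
 * shown, renumbered from 0. s390x uses a different layout and is rebuilt
 * separately at the end.
 */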
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
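/*
 * cpuacct.usage_all is expected to look like (example values, in
 * nanoseconds):
 *
 *	cpu user system
 *	0 100000000 50000000
 *	1 200000000 75000000
 *
 * If it is unavailable we fall back to cpuacct.usage_percpu, which is a
 * single line of per-CPU totals.
 */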
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			i++;
			lxcfs_debug("cpu%d with time %s", i, tok);
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);

			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

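/* Allocate one hash bucket head and initialize its rwlock. */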
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	__do_free struct cg_proc_stat_head *h = NULL;

	h = zalloc(sizeof(struct cg_proc_stat_head));
	if (!h)
		return false;

	if (pthread_rwlock_init(&h->lock, NULL))
		return false;

	h->lastcheck = time(NULL);

	*head = move_ptr(h);
	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node;

	if (head->next) {
		node = head->next;

		for (;;) {
			struct cg_proc_stat *cur = node;
			node = node->next;
			free_proc_stat_node(cur);
			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}