/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "config.h"

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "proc_cpuview.h"

#include "bindings.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

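/*
 * Overwrite a node's recorded host usage with the caller's snapshot and zero
 * the per-CPU view counters. Used when a cgroup's counters went backwards,
 * i.e. the cgroup was likely recreated.
 */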
static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

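/*
 * Grow a node's usage/view arrays to cpu_count entries, preserving existing
 * per-CPU counters. Returns false, leaving the node untouched, if either
 * allocation fails.
 */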
static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

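/*
 * Insert new_node into its hash bucket, keeping the bucket's singly linked
 * list unique per cgroup: if a node for the same cgroup already exists, the
 * new node is freed via the cleanup handler and the existing one is returned.
 */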
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
	struct cg_proc_stat *rv = new_node;
	int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *cur;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

	cur = head->next;

	for (;;) {
		/*
		 * The node to be added is already present in the list, so
		 * free the newly allocated one and return the one we found.
		 */
		if (strcmp(cur->cg, new->cg) == 0) {
			rv = cur;
			goto out_rwlock_unlock;
		}

		/* Keep walking. */
		if (cur->next) {
			cur = cur->next;
			continue;
		}

		/* Add new node to end of list. */
		cur->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

out_rwlock_unlock:
	pthread_rwlock_unlock(&head->lock);
	return move_ptr(rv);
}

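/*
 * Allocate and initialize a stat node for cgroup cg, duplicating the
 * caller's usage snapshot. Returns NULL on allocation or mutex-init failure;
 * partially constructed nodes are released by the cleanup handler.
 */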
static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

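/*
 * Check whether a controller file (e.g. "cpu.shares") exists for the given
 * cgroup by probing it relative to the controller's mount fd.
 */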
static bool cgroup_supports(const char *controller, const char *cgroup,
			    const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return faccessat(cfd, path, F_OK, 0) == 0;
}

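/*
 * Walk a bucket list and drop nodes whose cgroup no longer has a cpu.shares
 * file, i.e. whose cgroup has gone away. Returns the new list head. The
 * caller must hold the bucket's write lock.
 */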
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
			call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			lxcfs_debug("Removing stat node for %s\n", cur->cg);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
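/*
 * Prune stale nodes from the hash buckets, rate limited per bucket via its
 * lastcheck timestamp: reaching a bucket checked less than
 * PROC_STAT_PRUNE_INTERVAL seconds ago ends the sweep early.
 */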
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

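/*
 * Look up the stat node for cgroup cg in the given hash bucket under the
 * read lock. Also kicks off an opportunistic prune of the whole history.
 */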
static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

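/*
 * Return the stat node for cgroup cg, creating it if it does not exist yet
 * and expanding its counter arrays if the host gained CPUs. On success the
 * node is returned with node->lock held; the caller must unlock it.
 */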
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
							   int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

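/*
 * Move up to *surplus ticks of CPU time into *counter (pointing at this
 * CPU's user or system field) without pushing user + system past threshold
 * or exceeding the CPU's idle time. *surplus and usage->idle are reduced by
 * the amount moved.
 */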
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	if (free_space > *surplus)
		to_add = *surplus;
	else
		to_add = free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

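/*
 * Compute per-CPU deltas between two usage snapshots into diff, clamping
 * each delta to zero if a counter went backwards, and return the total delta
 * summed over all online CPUs.
 */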
static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read a cgroup CPU quota parameter from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us` (or, on a pure unified layout, from `cpu.max`),
 * depending on `param`. The parameter value is returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
	bool first = true;
	int ret;

	if (pure_unified_layout(cgroup_ops)) {
		first = !strcmp(param, "quota");
		ret = snprintf(file, sizeof(file), "cpu.max");
	} else {
		ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	}
	if (ret < 0 || (size_t)ret >= sizeof(file))
		return false;

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
}

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset size. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

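/*
 * Render the cpu* lines of a container's /proc/stat view into buf, combining
 * the host's /proc/stat (f) with the cgroup's per-CPU usage (cg_cpu_usage).
 * Usage accrued on CPUs beyond the quota-derived maximum is folded back into
 * the visible CPUs, and idle time is trimmed to match a fractional quota.
 * Rendered cpu lines carry only user, system, and idle; all other fields are
 * reported as 0. The remaining /proc/stat lines are passed through verbatim.
 * Returns the number of bytes written, or 0 on error.
 */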
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines. */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* Not a ^cpuN line containing a number N. */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled. */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than are available in the cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	/* Takes node->lock on success; released at out_pthread_mutex_unlock. */
	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff)
		goto out_pthread_mutex_unlock;

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs. */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* Revise the cpu usage view to support the partial-cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			if (idle_sum > delta)
				idle_sum = idle_sum - delta;
			else
				idle_sum = 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			if (stat_node->view[curcpu].idle > delta)
				stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
			else
				stat_node->view[curcpu].idle = 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file. */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0) {
		lxcfs_error("Failed to write cache");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}
	if (l >= buf_size) {
		lxcfs_error("Write to cache was truncated");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs. */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0) {
			lxcfs_error("Failed to write cache");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}
		if (l >= buf_size) {
			lxcfs_error("Write to cache was truncated");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat through, starting with the last line read. */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0) {
		lxcfs_error("Failed to write cache");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}
	if (l >= buf_size) {
		lxcfs_error("Write to cache was truncated");
		total_len = 0;
		goto out_pthread_mutex_unlock;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat through. */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			lxcfs_error("Failed to write cache");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}
		if (l >= buf_size) {
			lxcfs_error("Write to cache was truncated");
			total_len = 0;
			goto out_pthread_mutex_unlock;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

out_pthread_mutex_unlock:
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

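/*
 * FUSE read handler for a container's /proc/cpuinfo: prints only processors
 * in the caller's cpuset, renumbered starting at 0, and, when CPU view is in
 * use, at most max_cpu_count() of them. The s390x format, which lists
 * processors as "processor N:" plus a summary header, is handled separately.
 */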
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors:", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* Read from offset 0. */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

/*
 * Read the cgroup's per-CPU user and system times into *return_usage,
 * converted from nanoseconds to USER_HZ, and the CPU count into *size.
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			i++;
			lxcfs_debug("cpu%d with time %s", i, tok);
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);

			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

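/*
 * Allocate one hash bucket head and initialize its rwlock and lastcheck
 * timestamp.
 */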
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	__do_free struct cg_proc_stat_head *h;

	h = zalloc(sizeof(struct cg_proc_stat_head));
	if (!h)
		return false;

	if (pthread_rwlock_init(&h->lock, NULL))
		return false;

	h->lastcheck = time(NULL);

	*head = move_ptr(h);
	return true;
}

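/*
 * Initialize the per-bucket heads of the CPU view hash table. On failure any
 * already-initialized buckets are freed again.
 */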
bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

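/*
 * Free a hash bucket: every stat node on its list, the rwlock, and the head
 * itself.
 */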
static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node;

	if (head->next) {
		node = head->next;

		for (;;) {
			struct cg_proc_stat *cur = node;
			node = node->next;
			free_proc_stat_node(cur);
			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}