1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #ifndef FUSE_USE_VERSION
8 #define FUSE_USE_VERSION 26
9 #endif
10
11 #define _FILE_OFFSET_BITS 64
12
13 #define __STDC_FORMAT_MACROS
14 #include <dirent.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <fuse.h>
18 #include <inttypes.h>
19 #include <libgen.h>
20 #include <pthread.h>
21 #include <sched.h>
22 #include <stdarg.h>
23 #include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <wait.h>
31 #include <linux/magic.h>
32 #include <linux/sched.h>
33 #include <sys/epoll.h>
34 #include <sys/mman.h>
35 #include <sys/mount.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <sys/syscall.h>
39 #include <sys/sysinfo.h>
40 #include <sys/vfs.h>
41
42 #include "bindings.h"
43 #include "config.h"
44 #include "cgroup_fuse.h"
45 #include "cpuset_parse.h"
46 #include "cgroups/cgroup.h"
47 #include "cgroups/cgroup_utils.h"
48 #include "memory_utils.h"
49 #include "proc_loadavg.h"
50 #include "utils.h"
51
52 /* Data for CPU view */
53 struct cg_proc_stat {
54 char *cg;
55 struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat. */
56 struct cpuacct_usage *view; /* Usage stats reported to the container. */
57 int cpu_count;
58 pthread_mutex_t lock; /* For node manipulation. */
59 struct cg_proc_stat *next;
60 };
61
62 struct cg_proc_stat_head {
63 struct cg_proc_stat *next;
64 time_t lastcheck;
65
66 /*
67 * For access to the list. Reading can be parallel, pruning is exclusive.
68 */
69 pthread_rwlock_t lock;
70 };
71
72 #define CPUVIEW_HASH_SIZE 100
73 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
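
/*
 * Rough sketch of how the history is organized: every cgroup gets one
 * struct cg_proc_stat node, placed in the bucket selected by
 * calc_hash(cg) % CPUVIEW_HASH_SIZE and chained via ->next. For a
 * hypothetical cgroup "/lxc/c1" whose hash happened to be 342, the node
 * would live in bucket 42 of proc_stat_history, next to any other cgroups
 * that collide there.
 */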
74
75 static void reset_proc_stat_node(struct cg_proc_stat *node,
76 struct cpuacct_usage *usage, int cpu_count)
77 {
78 lxcfs_debug("Resetting stat node for %s\n", node->cg);
79 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
80
81 for (int i = 0; i < cpu_count; i++) {
82 node->view[i].user = 0;
83 node->view[i].system = 0;
84 node->view[i].idle = 0;
85 }
86
87 node->cpu_count = cpu_count;
88 }
89
90 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
91 {
92 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
93
94 /* Allocate new memory */
95 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
96 if (!new_usage)
97 return false;
98
99 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
100 if (!new_view)
101 return false;
102
103 /* Copy existing data & initialize new elements */
104 for (int i = 0; i < cpu_count; i++) {
105 if (i < node->cpu_count) {
106 new_usage[i].user = node->usage[i].user;
107 new_usage[i].system = node->usage[i].system;
108 new_usage[i].idle = node->usage[i].idle;
109
110 new_view[i].user = node->view[i].user;
111 new_view[i].system = node->view[i].system;
112 new_view[i].idle = node->view[i].idle;
113 }
114 }
115
116 free(node->usage);
117 node->usage = move_ptr(new_usage);
118
119 free(node->view);
120 node->view = move_ptr(new_view);
121 node->cpu_count = cpu_count;
122
123 return true;
124 }
125
126 static void free_proc_stat_node(struct cg_proc_stat *node)
127 {
128 if (node) {
129 /*
130 * We're abusing the usage pointer to indicate that
131 * pthread_mutex_init() was successful. Don't judge me.
132 */
133 if (node->usage)
134 pthread_mutex_destroy(&node->lock);
135 free_disarm(node->cg);
136 free_disarm(node->usage);
137 free_disarm(node->view);
138 free_disarm(node);
139 }
140 }
141
142 define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
143
144 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
145 {
146 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
147 struct cg_proc_stat *rv = new_node;
148 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
149 struct cg_proc_stat_head *head = proc_stat_history[hash];
150 struct cg_proc_stat *cur;
151
152 pthread_rwlock_wrlock(&head->lock);
153
154 if (!head->next) {
155 head->next = move_ptr(new);
156 goto out_rwlock_unlock;
157 }
158
159 cur = head->next;
160
161 for (;;) {
162 /*
163 * The node to be added is already present in the list, so
164 * free the newly allocated one and return the one we found.
165 */
166 if (strcmp(cur->cg, new->cg) == 0) {
167 rv = cur;
168 goto out_rwlock_unlock;
169 }
170
171 /* Keep walking. */
172 if (cur->next) {
173 cur = cur->next;
174 continue;
175 }
176
177 /* Add new node to end of list. */
178 cur->next = move_ptr(new);
179 goto out_rwlock_unlock;
180 }
181
182 out_rwlock_unlock:
183 pthread_rwlock_unlock(&head->lock);
184 return move_ptr(rv);
185 }
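
/*
 * Note on the cleanup pattern in add_proc_stat_node(): `new` is registered
 * with call_cleaner(free_proc_stat_node), so it is released automatically on
 * return unless ownership is transferred into the list with move_ptr(). When
 * the cgroup is already present, the freshly allocated duplicate is therefore
 * freed and the existing node is returned instead.
 */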
186
187 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
188 int cpu_count, const char *cg)
189 {
190 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
191 __do_free struct cpuacct_usage *new_usage = NULL;
192
193 node = zalloc(sizeof(struct cg_proc_stat));
194 if (!node)
195 return NULL;
196
197 node->cg = strdup(cg);
198 if (!node->cg)
199 return NULL;
200
201 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
202 if (!new_usage)
203 return NULL;
204
205 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
206 if (!node->view)
207 return NULL;
208
209 node->cpu_count = cpu_count;
210
211 if (pthread_mutex_init(&node->lock, NULL))
212 return NULL;
213 /*
214 * We're abusing the usage pointer to indicate that
215 * pthread_mutex_init() was successful. Don't judge me.
216 */
217 node->usage = move_ptr(new_usage);
218
219 return move_ptr(node);
220 }
221
222 static bool cgroup_supports(const char *controller, const char *cgroup,
223 const char *file)
224 {
225 __do_free char *path = NULL;
226 int cfd;
227
228 cfd = get_cgroup_fd(controller);
229 if (cfd < 0)
230 return false;
231
232 path = must_make_path_relative(cgroup, file, NULL);
233 return faccessat(cfd, path, F_OK, 0) == 0;
234 }
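
/*
 * Usage sketch for cgroup_supports(), with a hypothetical cgroup "/lxc/c1":
 * cgroup_supports("cpu", "/lxc/c1", "cpu.shares") builds a path relative to
 * the cpu controller's directory fd (roughly "lxc/c1/cpu.shares") and checks
 * its existence via faccessat(), i.e. whether the cgroup still exposes the
 * cpu controller. The pruning code below relies on this to drop stale nodes.
 */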
235
236 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
237 {
238 struct cg_proc_stat *first = NULL;
239
240 for (struct cg_proc_stat *prev = NULL; node; ) {
241 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
242 call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;
243
244 if (prev)
245 prev->next = node->next;
246 else
247 first = node->next;
248
249 node = node->next;
250 lxcfs_debug("Removing stat node for %s\n", cur->cg);
251 } else {
252 if (!first)
253 first = node;
254 prev = node;
255 node = node->next;
256 }
257 }
258
259 return first;
260 }
261
262 #define PROC_STAT_PRUNE_INTERVAL 10
263 static void prune_proc_stat_history(void)
264 {
265 time_t now = time(NULL);
266
267 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
268 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
269
270 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
271 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
272 return;
273 }
274
275 if (proc_stat_history[i]->next) {
276 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
277 proc_stat_history[i]->lastcheck = now;
278 }
279
280 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
281 }
282 }
283
284 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
285 const char *cg)
286 {
287 struct cg_proc_stat *node;
288
289 pthread_rwlock_rdlock(&head->lock);
290
291 if (!head->next) {
292 pthread_rwlock_unlock(&head->lock);
293 return NULL;
294 }
295
296 node = head->next;
297
298 do {
299 if (strcmp(cg, node->cg) == 0)
300 goto out;
301 } while ((node = node->next));
302
303 node = NULL;
304
305 out:
306 pthread_rwlock_unlock(&head->lock);
307 prune_proc_stat_history();
308 return node;
309 }
310
311 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
312 {
313 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
314 struct cg_proc_stat_head *head = proc_stat_history[hash];
315 struct cg_proc_stat *node;
316
317 node = find_proc_stat_node(head, cg);
318 if (!node) {
319 node = new_proc_stat_node(usage, cpu_count, cg);
320 if (!node)
321 return NULL;
322
323 node = add_proc_stat_node(node);
324 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
325 }
326
327 pthread_mutex_lock(&node->lock);
328
329 /*
330 * If additional CPUs on the host have been enabled, CPU usage counter
331 * arrays have to be expanded.
332 */
333 if (node->cpu_count < cpu_count) {
334 lxcfs_debug("Expanding stat node %d->%d for %s\n",
335 node->cpu_count, cpu_count, cg);
336
337 if (!expand_proc_stat_node(node, cpu_count)) {
338 pthread_mutex_unlock(&node->lock);
339 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
340 }
341 }
342
343 return node;
344 }
345
346 static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
347 uint64_t *counter, uint64_t threshold)
348 {
349 uint64_t free_space, to_add;
350
351 free_space = threshold - usage->user - usage->system;
352
353 if (free_space > usage->idle)
354 free_space = usage->idle;
355
356 if (free_space > *surplus)
357 to_add = *surplus;
358 else
359 to_add = free_space;
360
361 *counter += to_add;
362 usage->idle -= to_add;
363 *surplus -= to_add;
364 }
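
/*
 * Worked example for add_cpu_usage(), with made-up numbers: threshold = 1000,
 * usage->user + usage->system = 400, usage->idle = 700 and *surplus = 500.
 * free_space is 600, so the full surplus of 500 fits: *counter grows by 500,
 * idle drops to 200 and the surplus is used up. Time left over from CPUs
 * hidden by the quota is redistributed this way without pushing any visible
 * CPU past the per-CPU threshold.
 */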
365
366 static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
367 struct cpuacct_usage *newer,
368 struct cpuacct_usage *diff, int cpu_count)
369 {
370 uint64_t sum = 0;
371
372 for (int i = 0; i < cpu_count; i++) {
373 if (!newer[i].online)
374 continue;
375
376 /*
377 * When cpuset is changed on the fly, the CPUs might get
378 * reordered. We could either reset all counters, or check
379 * that the subtractions below will return expected results.
380 */
381 if (newer[i].user > older[i].user)
382 diff[i].user = newer[i].user - older[i].user;
383 else
384 diff[i].user = 0;
385
386 if (newer[i].system > older[i].system)
387 diff[i].system = newer[i].system - older[i].system;
388 else
389 diff[i].system = 0;
390
391 if (newer[i].idle > older[i].idle)
392 diff[i].idle = newer[i].idle - older[i].idle;
393 else
394 diff[i].idle = 0;
395
396 sum += diff[i].user;
397 sum += diff[i].system;
398 sum += diff[i].idle;
399 }
400
401 return sum;
402 }
403
404 /*
405 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
406 * `cpu.cfs_period_us` (or from `cpu.max` on the unified hierarchy),
407 * depending on `param`. The parameter value is returned through `value`.
408 */
409 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
410 {
411 __do_free char *str = NULL;
412 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
413 bool first = true;
414 int ret;
415
416 if (pure_unified_layout(cgroup_ops)) {
417 first = !strcmp(param, "quota");
418 ret = snprintf(file, sizeof(file), "cpu.max");
419 } else {
420 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
421 }
422 if (ret < 0 || (size_t)ret >= sizeof(file))
423 return false;
424
425 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
426 return false;
427
428 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
429 }
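
/*
 * For illustration, the two layouts this helper has to parse:
 * - legacy hierarchy: cpu.cfs_quota_us and cpu.cfs_period_us each hold a
 *   single integer, e.g. "150000" and "100000";
 * - unified hierarchy: cpu.max holds both values in one line, e.g.
 *   "150000 100000" (or "max 100000" when no quota is set), which is why the
 *   quota is taken from the first field and the period from the second.
 */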
430
431 /*
432 * Return the exact number of visible CPUs based on CPU quotas.
433 * If there is no quota set, zero is returned.
434 */
435 static double exact_cpu_count(const char *cg)
436 {
437 double rv;
438 int nprocs;
439 int64_t cfs_quota, cfs_period;
440
441 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
442 return 0;
443
444 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
445 return 0;
446
447 if (cfs_quota <= 0 || cfs_period <= 0)
448 return 0;
449
450 rv = (double)cfs_quota / (double)cfs_period;
451
452 nprocs = get_nprocs();
453
454 if (rv > nprocs)
455 rv = nprocs;
456
457 return rv;
458 }
459
460 /*
461 * Return the maximum number of visible CPUs based on the CPU quota and
462 * the cgroup's cpuset. If neither constrains the cgroup, zero is returned.
463 */
464 int max_cpu_count(const char *cg)
465 {
466 __do_free char *cpuset = NULL;
467 int rv, nprocs;
468 int64_t cfs_quota, cfs_period;
469 int nr_cpus_in_cpuset = 0;
470
471 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
472 return 0;
473
474 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
475 return 0;
476
477 cpuset = get_cpuset(cg);
478 if (cpuset)
479 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
480
481 if (cfs_quota <= 0 || cfs_period <= 0) {
482 if (nr_cpus_in_cpuset > 0)
483 return nr_cpus_in_cpuset;
484
485 return 0;
486 }
487
488 rv = cfs_quota / cfs_period;
489
490 /*
491 * In case quota/period does not yield a whole number, add one CPU for
492 * the remainder.
493 */
494 if ((cfs_quota % cfs_period) > 0)
495 rv += 1;
496
497 nprocs = get_nprocs();
498 if (rv > nprocs)
499 rv = nprocs;
500
501 /* Use the smaller of the cpu quota and the cpuset size. */
502 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
503 rv = nr_cpus_in_cpuset;
504
505 return rv;
506 }
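
/*
 * Worked example with made-up values: cfs_quota = 150000 and
 * cfs_period = 100000 give exact_cpu_count() = 1.5, while max_cpu_count()
 * rounds the 0.5 remainder up to 2 CPUs. Both results are capped at the
 * host's CPU count, and max_cpu_count() is additionally capped at the number
 * of CPUs in the cgroup's cpuset.
 */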
507
508 int cpuview_proc_stat(const char *cg, const char *cpuset,
509 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
510 FILE *f, char *buf, size_t buf_size)
511 {
512 __do_free char *line = NULL;
513 __do_free struct cpuacct_usage *diff = NULL;
514 size_t linelen = 0, total_len = 0;
515 int curcpu = -1; /* cpu numbering starts at 0 */
516 int physcpu, i;
517 int cpu_cnt = 0;
518 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
519 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
520 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
521 uint64_t user_surplus = 0, system_surplus = 0;
522 int nprocs, max_cpus;
523 ssize_t l;
524 uint64_t total_sum, threshold;
525 struct cg_proc_stat *stat_node;
526
527 nprocs = get_nprocs_conf();
528 if (cg_cpu_usage_size < nprocs)
529 nprocs = cg_cpu_usage_size;
530
531 /* Read all per-CPU stat lines and stop at the first non-CPU line. */
532 while (getline(&line, &linelen, f) != -1) {
533 int ret;
534 char cpu_char[10]; /* That's a lot of cores */
535 uint64_t all_used, cg_used;
536
537 if (strlen(line) == 0)
538 continue;
539
540 /* not a ^cpuN line containing a number N */
541 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
542 break;
543
544 if (sscanf(cpu_char, "%d", &physcpu) != 1)
545 continue;
546
547 if (physcpu >= cg_cpu_usage_size)
548 continue;
549
550 curcpu++;
551 cpu_cnt++;
552
553 if (!cpu_in_cpuset(physcpu, cpuset)) {
554 for (i = curcpu; i <= physcpu; i++)
555 cg_cpu_usage[i].online = false;
556 continue;
557 }
558
559 if (curcpu < physcpu) {
560 /* Some CPUs may be disabled */
561 for (i = curcpu; i < physcpu; i++)
562 cg_cpu_usage[i].online = false;
563
564 curcpu = physcpu;
565 }
566
567 cg_cpu_usage[curcpu].online = true;
568
569 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
570 &user,
571 &nice,
572 &system,
573 &idle,
574 &iowait,
575 &irq,
576 &softirq,
577 &steal,
578 &guest,
579 &guest_nice);
580 if (ret != 10)
581 continue;
582
583 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
584 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
585
586 if (all_used >= cg_used) {
587 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
588
589 } else {
590 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
591 curcpu, cg, all_used, cg_used);
592 cg_cpu_usage[curcpu].idle = idle;
593 }
594 }
595
596 /* Cannot use more CPUs than are available in the cpuset. */
597 max_cpus = max_cpu_count(cg);
598 if (max_cpus > cpu_cnt || !max_cpus)
599 max_cpus = cpu_cnt;
600
601 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
602 if (!diff)
603 return 0;
604
605 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
606 if (!stat_node)
607 return log_error(0, "Failed to find/create stat node for %s", cg);
608
609 /*
610 * If the new values are LOWER than values stored in memory, it means
611 * the cgroup has been reset/recreated and we should reset too.
612 */
613 for (curcpu = 0; curcpu < nprocs; curcpu++) {
614 if (!cg_cpu_usage[curcpu].online)
615 continue;
616
617 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
618 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
619
620 break;
621 }
622
623 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
624
625 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
626 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
627
628 if (!stat_node->usage[curcpu].online)
629 continue;
630
631 i++;
632
633 stat_node->usage[curcpu].user += diff[curcpu].user;
634 stat_node->usage[curcpu].system += diff[curcpu].system;
635 stat_node->usage[curcpu].idle += diff[curcpu].idle;
636
637 if (max_cpus > 0 && i >= max_cpus) {
638 user_surplus += diff[curcpu].user;
639 system_surplus += diff[curcpu].system;
640 }
641 }
642
643 /* Calculate usage counters of visible CPUs */
644 if (max_cpus > 0) {
645 uint64_t diff_user = 0;
646 uint64_t diff_system = 0;
647 uint64_t diff_idle = 0;
648 uint64_t max_diff_idle = 0;
649 uint64_t max_diff_idle_index = 0;
650 double exact_cpus;
651
652 /* threshold = maximum usage per cpu, including idle */
653 threshold = total_sum / cpu_cnt * max_cpus;
654
655 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
656 if (!stat_node->usage[curcpu].online)
657 continue;
658
659 i++;
660
661 if (i == max_cpus)
662 break;
663
664 if (diff[curcpu].user + diff[curcpu].system >= threshold)
665 continue;
666
667 /* Add user */
668 add_cpu_usage(&user_surplus, &diff[curcpu],
669 &diff[curcpu].user, threshold);
670
671 if (diff[curcpu].user + diff[curcpu].system >= threshold)
672 continue;
673
674 /* If there is still room, add system */
675 add_cpu_usage(&system_surplus, &diff[curcpu],
676 &diff[curcpu].system, threshold);
677 }
678
679 if (user_surplus > 0)
680 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
681 if (system_surplus > 0)
682 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
683
684 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
685 if (!stat_node->usage[curcpu].online)
686 continue;
687
688 i++;
689
690 if (i == max_cpus)
691 break;
692
693 stat_node->view[curcpu].user += diff[curcpu].user;
694 stat_node->view[curcpu].system += diff[curcpu].system;
695 stat_node->view[curcpu].idle += diff[curcpu].idle;
696
697 user_sum += stat_node->view[curcpu].user;
698 system_sum += stat_node->view[curcpu].system;
699 idle_sum += stat_node->view[curcpu].idle;
700
701 diff_user += diff[curcpu].user;
702 diff_system += diff[curcpu].system;
703 diff_idle += diff[curcpu].idle;
704 if (diff[curcpu].idle > max_diff_idle) {
705 max_diff_idle = diff[curcpu].idle;
706 max_diff_idle_index = curcpu;
707 }
708
709 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
710 }
711 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
712
713 /* Revise the cpu usage view to handle the partial (fractional) cpu case. */
714 exact_cpus = exact_cpu_count(cg);
715 if (exact_cpus < (double)max_cpus){
716 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
717
718 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
719 lxcfs_v("delta: %lu\n", delta);
720 lxcfs_v("idle_sum before: %lu\n", idle_sum);
721 if (idle_sum > delta)
722 idle_sum = idle_sum - delta;
723 else
724 idle_sum = 0;
725 lxcfs_v("idle_sum after: %lu\n", idle_sum);
726
727 curcpu = max_diff_idle_index;
728 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
729 if (stat_node->view[curcpu].idle > delta)
730 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
731 else
732 stat_node->view[curcpu].idle = 0;
733 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
734 }
735 } else {
736 for (curcpu = 0; curcpu < nprocs; curcpu++) {
737 if (!stat_node->usage[curcpu].online)
738 continue;
739
740 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
741 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
742 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
743
744 user_sum += stat_node->view[curcpu].user;
745 system_sum += stat_node->view[curcpu].system;
746 idle_sum += stat_node->view[curcpu].idle;
747 }
748 }
749
750 /* Render the file */
751 /* cpu-all */
752 l = snprintf(buf, buf_size,
753 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
754 user_sum, system_sum, idle_sum);
755 lxcfs_v("cpu-all: %s\n", buf);
756 if (l < 0)
757 return log_error(0, "Failed to write cache");
758 if (l >= buf_size)
759 return log_error(0, "Write to cache was truncated");
760
761 buf += l;
762 buf_size -= l;
763 total_len += l;
764
765 /* Render visible CPUs */
766 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
767 if (!stat_node->usage[curcpu].online)
768 continue;
769
770 i++;
771
772 if (max_cpus > 0 && i == max_cpus)
773 break;
774
775 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
776 i,
777 stat_node->view[curcpu].user,
778 stat_node->view[curcpu].system,
779 stat_node->view[curcpu].idle);
780 lxcfs_v("cpu: %s\n", buf);
781 if (l < 0)
782 return log_error(0, "Failed to write cache");
783 if (l >= buf_size)
784 return log_error(0, "Write to cache was truncated");
785
786 buf += l;
787 buf_size -= l;
788 total_len += l;
789 }
790
791 /* Pass along the rest of /proc/stat, starting with the last line read. */
792 l = snprintf(buf, buf_size, "%s", line);
793 if (l < 0)
794 return log_error(0, "Failed to write cache");
795 if (l >= buf_size)
796 return log_error(0, "Write to cache was truncated");
797
798 buf += l;
799 buf_size -= l;
800 total_len += l;
801
802 /* Pass the rest of the host's /proc/stat */
803 while (getline(&line, &linelen, f) != -1) {
804 l = snprintf(buf, buf_size, "%s", line);
805 if (l < 0)
806 return log_error(0, "Failed to write cache");
807 if (l >= buf_size)
808 return log_error(0, "Write to cache was truncated");
809
810 buf += l;
811 buf_size -= l;
812 total_len += l;
813 }
814
815 if (stat_node)
816 pthread_mutex_unlock(&stat_node->lock);
817
818 return total_len;
819 }
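
/*
 * The rendered view is a minimal /proc/stat: only the user, system and idle
 * fields are filled in, the rest are zeroed, and CPUs are renumbered from 0.
 * For a container limited to two CPUs the output would look roughly like:
 *
 *   cpu  <user_sum> 0 <system_sum> <idle_sum> 0 0 0 0 0 0
 *   cpu0 <user> 0 <system> <idle> 0 0 0 0 0 0
 *   cpu1 <user> 0 <system> <idle> 0 0 0 0 0 0
 *   <remaining non-cpu lines copied from the host's /proc/stat>
 */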
820
821 /*
822 * check whether this is a '^processor' line in /proc/cpuinfo
823 */
824 static inline bool is_processor_line(const char *line)
825 {
826 int cpu;
827 return sscanf(line, "processor : %d", &cpu) == 1;
828 }
829
830 static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
831 {
832 int cpu;
833
834 if (sscanf(line, "processor : %d", &cpu) == 1)
835 return cpu_in_cpuset(cpu, cpuset);
836
837 return false;
838 }
839
840 int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
841 struct fuse_file_info *fi)
842 {
843 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
844 __do_free void *fopen_cache = NULL;
845 __do_fclose FILE *f = NULL;
846 struct fuse_context *fc = fuse_get_context();
847 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
848 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
849 size_t linelen = 0, total_len = 0;
850 bool am_printing = false, firstline = true, is_s390x = false;
851 int curcpu = -1, cpu, max_cpus = 0;
852 bool use_view;
853 char *cache = d->buf;
854 size_t cache_size = d->buflen;
855
856 if (offset) {
857 int left;
858
859 if (offset > d->size)
860 return -EINVAL;
861
862 if (!d->cached)
863 return 0;
864
865 left = d->size - offset;
866 total_len = left > size ? size: left;
867 memcpy(buf, cache + offset, total_len);
868
869 return total_len;
870 }
871
872 pid_t initpid = lookup_initpid_in_store(fc->pid);
873 if (initpid <= 1 || is_shared_pidns(initpid))
874 initpid = fc->pid;
875
876 cg = get_pid_cgroup(initpid, "cpuset");
877 if (!cg)
878 return read_file_fuse("proc/cpuinfo", buf, size, d);
879 prune_init_slice(cg);
880
881 cpuset = get_cpuset(cg);
882 if (!cpuset)
883 return 0;
884
885 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
886 use_view = true;
887 else
888 use_view = false;
889 if (use_view)
890 max_cpus = max_cpu_count(cg);
891
892 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
893 if (!f)
894 return 0;
895
896 while (getline(&line, &linelen, f) != -1) {
897 ssize_t l;
898 if (firstline) {
899 firstline = false;
900 if (strstr(line, "IBM/S390") != NULL) {
901 is_s390x = true;
902 am_printing = true;
903 continue;
904 }
905 }
906
907 if (strncmp(line, "# processors:", 12) == 0)
908 continue;
909
910 if (is_processor_line(line)) {
911 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
912 break;
913
914 am_printing = cpuline_in_cpuset(line, cpuset);
915 if (am_printing) {
916 curcpu++;
917 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
918 if (l < 0)
919 return log_error(0, "Failed to write cache");
920 if (l >= cache_size)
921 return log_error(0, "Write to cache was truncated");
922 cache += l;
923 cache_size -= l;
924 total_len += l;
925 }
926 continue;
927 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
928 char *p;
929
930 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
931 break;
932
933 if (!cpu_in_cpuset(cpu, cpuset))
934 continue;
935
936 curcpu++;
937 p = strchr(line, ':');
938 if (!p || !*p)
939 return 0;
940 p++;
941
942 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
943 if (l < 0)
944 return log_error(0, "Failed to write cache");
945 if (l >= cache_size)
946 return log_error(0, "Write to cache was truncated");
947
948 cache += l;
949 cache_size -= l;
950 total_len += l;
951 continue;
952
953 }
954 if (am_printing) {
955 l = snprintf(cache, cache_size, "%s", line);
956 if (l < 0)
957 return log_error(0, "Failed to write cache");
958 if (l >= cache_size)
959 return log_error(0, "Write to cache was truncated");
960
961 cache += l;
962 cache_size -= l;
963 total_len += l;
964 }
965 }
966
967 if (is_s390x) {
968 __do_free char *origcache = d->buf;
969 ssize_t l;
970
971 d->buf = malloc(d->buflen);
972 if (!d->buf) {
973 d->buf = move_ptr(origcache);
974 return 0;
975 }
976
977 cache = d->buf;
978 cache_size = d->buflen;
979 total_len = 0;
980 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
981 if (l < 0 || l >= cache_size)
982 return 0;
983
984 cache_size -= l;
985 cache += l;
986 total_len += l;
987 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
988 if (l < 0 || l >= cache_size)
989 return 0;
990
991 cache_size -= l;
992 cache += l;
993 total_len += l;
994 l = snprintf(cache, cache_size, "%s", origcache);
995 if (l < 0 || l >= cache_size)
996 return 0;
997 total_len += l;
998 }
999
1000 d->cached = 1;
1001 d->size = total_len;
1002 if (total_len > size)
1003 total_len = size;
1004
1005 /* read from off 0 */
1006 memcpy(buf, d->buf, total_len);
1007
1008 return total_len;
1009 }
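
/*
 * Sketch of the effect, assuming a container limited to two CPUs: x86-style
 * "processor : N" stanzas are renumbered from 0 and the listing stops once
 * max_cpus stanzas were emitted (when the CPU view is active). On s390x the
 * "processor N:" lines are renumbered instead and the result is prefixed
 * with a "vendor_id : IBM/S390" line and a "# processors : 2" line, matching
 * the format the code strips from the host's /proc/cpuinfo.
 */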
1010
1011 /*
1012 * Returns 0 on success.
1013 * It is the caller's responsibility to free `return_usage`, unless this
1014 * function returns an error.
1015 */
1016 int read_cpuacct_usage_all(char *cg, char *cpuset,
1017 struct cpuacct_usage **return_usage, int *size)
1018 {
1019 __do_free char *usage_str = NULL;
1020 __do_free struct cpuacct_usage *cpu_usage = NULL;
1021 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
1022 int cpucount;
1023 int ret;
1024 int cg_cpu;
1025 uint64_t cg_user, cg_system;
1026 int64_t ticks_per_sec;
1027
1028 ticks_per_sec = sysconf(_SC_CLK_TCK);
1029 if (ticks_per_sec < 0 && errno == EINVAL) {
1030 lxcfs_debug("%m - Failed to determine number of ticks per second");
1031 return -1;
1032 }
1033
1034 cpucount = get_nprocs_conf();
1035 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1036 if (!cpu_usage)
1037 return -ENOMEM;
1038
1039 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1040 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
1041 char *sep = " \t\n";
1042 char *tok;
1043
1044 /* Read cpuacct.usage_percpu instead. */
1045 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1046 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1047 return -1;
1048
1049 lxc_iterate_parts(tok, usage_str, sep) {
1050 uint64_t percpu_user;
1051
1052 if (i >= cpucount)
1053 break;
1054
1055 tok = trim_whitespace_in_place(tok);
1056 ret = safe_uint64(tok, &percpu_user, 10);
1057 if (ret)
1058 return -1;
1059
1060 /* Convert the time from nanoseconds to USER_HZ */
1061 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1062 cpu_usage[i].system = cpu_usage[i].user;
1063 i++;
1064 lxcfs_debug("cpu%d with time %s", i, tok);
1065 }
1066 } else {
1067 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1068 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1069
1070 read_pos += read_cnt;
1071
1072 for (i = 0, j = 0; i < cpucount; i++) {
1073 ret = sscanf(usage_str + read_pos,
1074 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1075 &cg_user, &cg_system, &read_cnt);
1076
1077 if (ret == EOF)
1078 break;
1079
1080 if (ret != 3)
1081 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1082 usage_str + read_pos, cg);
1083
1084 read_pos += read_cnt;
1085
1086 /* Convert the time from nanoseconds to USER_HZ */
1087 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1088 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1089 j++;
1090 }
1091 }
1092
1093 *return_usage = move_ptr(cpu_usage);
1094 *size = cpucount;
1095 return 0;
1096 }
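
/*
 * Conversion example with made-up values: with _SC_CLK_TCK = 100, a per-CPU
 * time of 2500000000 ns reported by cpuacct becomes
 * 2500000000 / 1e9 * 100 = 250 USER_HZ ticks, i.e. the same unit /proc/stat
 * uses.
 */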
1097
1098 static bool cpuview_init_head(struct cg_proc_stat_head **head)
1099 {
1100 __do_free struct cg_proc_stat_head *h;
1101
1102 h = zalloc(sizeof(struct cg_proc_stat_head));
1103 if (!h)
1104 return false;
1105
1106 if (pthread_rwlock_init(&h->lock, NULL))
1107 return false;
1108
1109 h->lastcheck = time(NULL);
1110
1111 *head = move_ptr(h);
1112 return true;
1113 }
1114
1115 bool init_cpuview(void)
1116 {
1117 int i;
1118
1119 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1120 proc_stat_history[i] = NULL;
1121
1122 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1123 if (!cpuview_init_head(&proc_stat_history[i]))
1124 goto err;
1125 }
1126
1127 return true;
1128
1129 err:
1130 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1131 if (proc_stat_history[i])
1132 free_disarm(proc_stat_history[i]);
1133 }
1134
1135 return false;
1136 }
1137
1138 static void cpuview_free_head(struct cg_proc_stat_head *head)
1139 {
1140 struct cg_proc_stat *node;
1141
1142 if (head->next) {
1143 node = head->next;
1144
1145 for (;;) {
1146 struct cg_proc_stat *cur = node;
1147 node = node->next;
1148 free_proc_stat_node(cur);
1149 if (!node)
1150 break;
1151 }
1152 }
1153
1154 pthread_rwlock_destroy(&head->lock);
1155 free_disarm(head);
1156 }
1157
1158 void free_cpuview(void)
1159 {
1160 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1161 if (proc_stat_history[i])
1162 cpuview_free_head(proc_stat_history[i]);
1163 }