/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
        char *cg;
        struct cpuacct_usage *usage;    // Real usage as read from the host's /proc/stat
        struct cpuacct_usage *view;     // Usage stats reported to the container
        int cpu_count;
        pthread_mutex_t lock;           // For node manipulation
        struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
        struct cg_proc_stat *next;
        time_t lastcheck;

        /*
         * For access to the list. Reading can be parallel, pruning is exclusive.
         */
        pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
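
/*
 * Illustrative note: cgroups are bucketed by calc_hash(cg) % CPUVIEW_HASH_SIZE
 * and collisions hang off each bucket's singly linked ->next chain, so a
 * lookup amounts to (sketch, assuming the head is already locked):
 *
 *      struct cg_proc_stat *node;
 *      for (node = head->next; node; node = node->next)
 *              if (strcmp(node->cg, cg) == 0)
 *                      break;
 *
 * find_proc_stat_node() below implements this walk under head->lock.
 */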

static void reset_proc_stat_node(struct cg_proc_stat *node,
                                 struct cpuacct_usage *usage, int cpu_count)
{
        lxcfs_debug("Resetting stat node for %s\n", node->cg);
        memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

        for (int i = 0; i < cpu_count; i++) {
                node->view[i].user = 0;
                node->view[i].system = 0;
                node->view[i].idle = 0;
        }

        node->cpu_count = cpu_count;
}

static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
        __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

        /* Allocate new memory */
        new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
        if (!new_usage)
                return false;

        new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
        if (!new_view)
                return false;

        /* Copy existing data & initialize new elements */
        for (int i = 0; i < cpu_count; i++) {
                if (i < node->cpu_count) {
                        new_usage[i].user = node->usage[i].user;
                        new_usage[i].system = node->usage[i].system;
                        new_usage[i].idle = node->usage[i].idle;

                        new_view[i].user = node->view[i].user;
                        new_view[i].system = node->view[i].system;
                        new_view[i].idle = node->view[i].idle;
                } else {
                        new_usage[i].user = 0;
                        new_usage[i].system = 0;
                        new_usage[i].idle = 0;

                        new_view[i].user = 0;
                        new_view[i].system = 0;
                        new_view[i].idle = 0;
                }
        }

        free(node->usage);
        node->usage = move_ptr(new_usage);

        free(node->view);
        node->view = move_ptr(new_view);
        node->cpu_count = cpu_count;

        return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
        if (node) {
                /*
                 * We're abusing the usage pointer to indicate that
                 * pthread_mutex_init() was successful. Don't judge me.
                 */
                if (node->usage)
                        pthread_mutex_destroy(&node->lock);
                free_disarm(node->cg);
                free_disarm(node->usage);
                free_disarm(node->view);
                free_disarm(node);
        }
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
        call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
        struct cg_proc_stat *rv = new_node;
        int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
        struct cg_proc_stat_head *head = proc_stat_history[hash];
        struct cg_proc_stat *cur;

        pthread_rwlock_wrlock(&head->lock);

        if (!head->next) {
                head->next = move_ptr(new);
                goto out_rwlock_unlock;
        }

        cur = head->next;

        for (;;) {
                /*
                 * The node to be added is already present in the list, so
                 * free the newly allocated one and return the one we found.
                 */
                if (strcmp(cur->cg, new->cg) == 0) {
                        rv = cur;
                        goto out_rwlock_unlock;
                }

                /* Keep walking. */
                if (cur->next) {
                        cur = cur->next;
                        continue;
                }

                /* Add new node to end of list. */
                cur->next = move_ptr(new);
                goto out_rwlock_unlock;
        }

out_rwlock_unlock:
        pthread_rwlock_unlock(&head->lock);
        return move_ptr(rv);
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
                                               int cpu_count, const char *cg)
{
        call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
        __do_free struct cpuacct_usage *new_usage = NULL;

        node = zalloc(sizeof(struct cg_proc_stat));
        if (!node)
                return NULL;

        node->cg = strdup(cg);
        if (!node->cg)
                return NULL;

        new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
        if (!new_usage)
                return NULL;

        node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
        if (!node->view)
                return NULL;

        node->cpu_count = cpu_count;

        if (pthread_mutex_init(&node->lock, NULL))
                return NULL;
        /*
         * We're abusing the usage pointer to indicate that
         * pthread_mutex_init() was successful. Don't judge me.
         */
        node->usage = move_ptr(new_usage);

        return move_ptr(node);
}

static bool cgfs_param_exist(const char *controller, const char *cgroup,
                             const char *file)
{
        __do_free char *path = NULL;
        int cfd;

        cfd = get_cgroup_fd(controller);
        if (cfd < 0)
                return false;

        path = must_make_path_relative(cgroup, file, NULL);
        return (faccessat(cfd, path, F_OK, 0) == 0);
}

static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
        struct cg_proc_stat *first = NULL;

        for (struct cg_proc_stat *prev = NULL; node; ) {
                if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
                        struct cg_proc_stat *tmp = node;

                        lxcfs_debug("Removing stat node for %s\n", node->cg);

                        if (prev)
                                prev->next = node->next;
                        else
                                first = node->next;

                        node = node->next;
                        free_proc_stat_node(tmp);
                } else {
                        if (!first)
                                first = node;
                        prev = node;
                        node = node->next;
                }
        }

        return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
        time_t now = time(NULL);

        for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
                pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

                if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
                        pthread_rwlock_unlock(&proc_stat_history[i]->lock);
                        return;
                }

                if (proc_stat_history[i]->next) {
                        proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
                        proc_stat_history[i]->lastcheck = now;
                }

                pthread_rwlock_unlock(&proc_stat_history[i]->lock);
        }
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
                                                const char *cg)
{
        struct cg_proc_stat *node;

        pthread_rwlock_rdlock(&head->lock);

        if (!head->next) {
                pthread_rwlock_unlock(&head->lock);
                return NULL;
        }

        node = head->next;

        do {
                if (strcmp(cg, node->cg) == 0)
                        goto out;
        } while ((node = node->next));

        node = NULL;

out:
        pthread_rwlock_unlock(&head->lock);
        prune_proc_stat_history();
        return node;
}

static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
        int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
        struct cg_proc_stat_head *head = proc_stat_history[hash];
        struct cg_proc_stat *node;

        node = find_proc_stat_node(head, cg);
        if (!node) {
                node = new_proc_stat_node(usage, cpu_count, cg);
                if (!node)
                        return NULL;

                node = add_proc_stat_node(node);
                lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
        }

        pthread_mutex_lock(&node->lock);

        /*
         * If additional CPUs on the host have been enabled, CPU usage counter
         * arrays have to be expanded.
         */
        if (node->cpu_count < cpu_count) {
                lxcfs_debug("Expanding stat node %d->%d for %s\n",
                            node->cpu_count, cpu_count, cg);

                if (!expand_proc_stat_node(node, cpu_count)) {
                        pthread_mutex_unlock(&node->lock);
                        return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
                }
        }

        return node;
}

static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
                          uint64_t *counter, uint64_t threshold)
{
        uint64_t free_space, to_add;

        free_space = threshold - usage->user - usage->system;

        if (free_space > usage->idle)
                free_space = usage->idle;

        to_add = free_space > *surplus ? *surplus : free_space;

        *counter += to_add;
        usage->idle -= to_add;
        *surplus -= to_add;
}
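
/*
 * Worked example (illustrative, numbers invented): with threshold = 100,
 * usage = { .user = 30, .system = 20, .idle = 60 } and *surplus = 40,
 * free_space starts as 100 - 30 - 20 = 50, is not capped by idle (60), and
 * to_add = min(50, 40) = 40. The call moves all 40 ticks of surplus into
 * *counter and shrinks idle to 20, i.e. time spent on a hidden CPU is
 * re-attributed to a visible one without exceeding the per-CPU threshold.
 */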

static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
                               struct cpuacct_usage *newer,
                               struct cpuacct_usage *diff, int cpu_count)
{
        uint64_t sum = 0;

        for (int i = 0; i < cpu_count; i++) {
                if (!newer[i].online)
                        continue;

                /*
                 * When cpuset is changed on the fly, the CPUs might get
                 * reordered. We could either reset all counters, or check
                 * that the subtractions below will return expected results.
                 */
                if (newer[i].user > older[i].user)
                        diff[i].user = newer[i].user - older[i].user;
                else
                        diff[i].user = 0;

                if (newer[i].system > older[i].system)
                        diff[i].system = newer[i].system - older[i].system;
                else
                        diff[i].system = 0;

                if (newer[i].idle > older[i].idle)
                        diff[i].idle = newer[i].idle - older[i].idle;
                else
                        diff[i].idle = 0;

                sum += diff[i].user;
                sum += diff[i].system;
                sum += diff[i].idle;
        }

        return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is returned
 * through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
        __do_free char *str = NULL;
        char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
        bool first = true;

        if (!pure_unified_layout(cgroup_ops)) {
                snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
        } else {
                strcpy(file, "cpu.max");
                first = !strcmp(param, "quota");
        }

        if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
                return false;

        if (sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) != 1)
                return false;

        return true;
}
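
/*
 * Example file contents (illustrative): on legacy cgroup v1 hierarchies the
 * quota and period live in separate files, e.g. cpu.cfs_quota_us containing
 * "50000" (-1 when unlimited) and cpu.cfs_period_us containing "100000". On
 * the unified cgroup v2 hierarchy both live in cpu.max as "$QUOTA $PERIOD",
 * e.g. "50000 100000", with the literal string "max" standing in for an
 * unlimited quota; that is why the v2 branch above reads the first or the
 * second field depending on `param`, and why an unlimited quota simply fails
 * the "%" PRId64 conversion and reports false.
 */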

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
        double rv;
        int nprocs;
        /* Initialize so a failed read below cannot leave these unset. */
        int64_t cfs_quota = 0, cfs_period = 0;

        read_cpu_cfs_param(cg, "quota", &cfs_quota);
        read_cpu_cfs_param(cg, "period", &cfs_period);

        if (cfs_quota <= 0 || cfs_period <= 0)
                return 0;

        rv = (double)cfs_quota / (double)cfs_period;

        nprocs = get_nprocs();

        if (rv > nprocs)
                rv = nprocs;

        return rv;
}
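
/*
 * Illustrative arithmetic: with cpu.cfs_quota_us = 50000 and
 * cpu.cfs_period_us = 100000 this yields 50000 / 100000 = 0.5, i.e. the
 * cgroup may consume half a CPU's worth of runtime per period.
 */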

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
        __do_free char *cpuset = NULL;
        int rv, nprocs;
        /* Initialize so a failed read below cannot leave these unset. */
        int64_t cfs_quota = 0, cfs_period = 0;
        int nr_cpus_in_cpuset = 0;

        read_cpu_cfs_param(cg, "quota", &cfs_quota);
        read_cpu_cfs_param(cg, "period", &cfs_period);

        cpuset = get_cpuset(cg);
        if (cpuset)
                nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

        if (cfs_quota <= 0 || cfs_period <= 0) {
                if (nr_cpus_in_cpuset > 0)
                        return nr_cpus_in_cpuset;

                return 0;
        }

        rv = cfs_quota / cfs_period;

        /*
         * In case quota/period does not yield a whole number, add one CPU for
         * the remainder.
         */
        if ((cfs_quota % cfs_period) > 0)
                rv += 1;

        nprocs = get_nprocs();
        if (rv > nprocs)
                rv = nprocs;

        /* Use the minimum of the CPU quota and the cpuset. */
        if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
                rv = nr_cpus_in_cpuset;

        return rv;
}
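
/*
 * Illustrative arithmetic: quota = 150000 and period = 100000 gives
 * 150000 / 100000 = 1 with a remainder of 50000, so one extra CPU is added
 * and two CPUs are shown; the fractional half CPU is later compensated for
 * in cpuview_proc_stat() via exact_cpu_count().
 */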

int cpuview_proc_stat(const char *cg, const char *cpuset,
                      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
                      FILE *f, char *buf, size_t buf_size)
{
        __do_free char *line = NULL;
        __do_free struct cpuacct_usage *diff = NULL;
        size_t linelen = 0, total_len = 0;
        int curcpu = -1; /* cpu numbering starts at 0 */
        int physcpu, i;
        int cpu_cnt = 0;
        uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
                 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
        uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
        uint64_t user_surplus = 0, system_surplus = 0;
        int nprocs, max_cpus;
        ssize_t l;
        uint64_t total_sum, threshold;
        struct cg_proc_stat *stat_node;

        nprocs = get_nprocs_conf();
        if (cg_cpu_usage_size < nprocs)
                nprocs = cg_cpu_usage_size;

        /* Read all CPU stats and stop when we've encountered other lines */
        while (getline(&line, &linelen, f) != -1) {
                int ret;
                char cpu_char[10]; /* That's a lot of cores */
                uint64_t all_used, cg_used;

                if (strlen(line) == 0)
                        continue;

                /* not a ^cpuN line containing a number N */
                if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
                        break;

                if (sscanf(cpu_char, "%d", &physcpu) != 1)
                        continue;

                if (physcpu >= cg_cpu_usage_size)
                        continue;

                curcpu++;
                cpu_cnt++;

                if (!cpu_in_cpuset(physcpu, cpuset)) {
                        for (i = curcpu; i <= physcpu; i++)
                                cg_cpu_usage[i].online = false;
                        continue;
                }

                if (curcpu < physcpu) {
                        /* Some CPUs may be disabled */
                        for (i = curcpu; i < physcpu; i++)
                                cg_cpu_usage[i].online = false;

                        curcpu = physcpu;
                }

                cg_cpu_usage[curcpu].online = true;

                ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
                             &user,
                             &nice,
                             &system,
                             &idle,
                             &iowait,
                             &irq,
                             &softirq,
                             &steal,
                             &guest,
                             &guest_nice);
                if (ret != 10)
                        continue;

                all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
                cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

                if (all_used >= cg_used) {
                        cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
                } else {
                        lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
                                    curcpu, cg, all_used, cg_used);
                        cg_cpu_usage[curcpu].idle = idle;
                }
        }

        /* Cannot use more CPUs than is available in cpuset. */
        max_cpus = max_cpu_count(cg);
        if (max_cpus > cpu_cnt || !max_cpus)
                max_cpus = cpu_cnt;

        stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
        if (!stat_node)
                return log_error(0, "Failed to find/create stat node for %s", cg);

        diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
        if (!diff)
                return 0;

        /*
         * If the new values are LOWER than values stored in memory, it means
         * the cgroup has been reset/recreated and we should reset too.
         */
        for (curcpu = 0; curcpu < nprocs; curcpu++) {
                if (!cg_cpu_usage[curcpu].online)
                        continue;

                if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
                        reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

                break;
        }

        total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

        for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
                stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

                if (!stat_node->usage[curcpu].online)
                        continue;

                i++;

                stat_node->usage[curcpu].user += diff[curcpu].user;
                stat_node->usage[curcpu].system += diff[curcpu].system;
                stat_node->usage[curcpu].idle += diff[curcpu].idle;

                if (max_cpus > 0 && i >= max_cpus) {
                        user_surplus += diff[curcpu].user;
                        system_surplus += diff[curcpu].system;
                }
        }

        /* Calculate usage counters of visible CPUs */
        if (max_cpus > 0) {
                uint64_t diff_user = 0;
                uint64_t diff_system = 0;
                uint64_t diff_idle = 0;
                uint64_t max_diff_idle = 0;
                uint64_t max_diff_idle_index = 0;
                double exact_cpus;

                /* threshold = maximum usage per cpu, including idle */
                threshold = total_sum / cpu_cnt * max_cpus;

                for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
                        if (!stat_node->usage[curcpu].online)
                                continue;

                        i++;

                        if (i == max_cpus)
                                break;

                        if (diff[curcpu].user + diff[curcpu].system >= threshold)
                                continue;

                        /* Add user */
                        add_cpu_usage(&user_surplus, &diff[curcpu],
                                      &diff[curcpu].user, threshold);

                        if (diff[curcpu].user + diff[curcpu].system >= threshold)
                                continue;

                        /* If there is still room, add system */
                        add_cpu_usage(&system_surplus, &diff[curcpu],
                                      &diff[curcpu].system, threshold);
                }

                if (user_surplus > 0)
                        lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
                if (system_surplus > 0)
                        lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

                for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
                        if (!stat_node->usage[curcpu].online)
                                continue;

                        i++;

                        if (i == max_cpus)
                                break;

                        stat_node->view[curcpu].user += diff[curcpu].user;
                        stat_node->view[curcpu].system += diff[curcpu].system;
                        stat_node->view[curcpu].idle += diff[curcpu].idle;

                        user_sum += stat_node->view[curcpu].user;
                        system_sum += stat_node->view[curcpu].system;
                        idle_sum += stat_node->view[curcpu].idle;

                        diff_user += diff[curcpu].user;
                        diff_system += diff[curcpu].system;
                        diff_idle += diff[curcpu].idle;
                        if (diff[curcpu].idle > max_diff_idle) {
                                max_diff_idle = diff[curcpu].idle;
                                max_diff_idle_index = curcpu;
                        }

                        lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
                }
                lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

                /* revise cpu usage view to support partial cpu case. */
                exact_cpus = exact_cpu_count(cg);
                if (exact_cpus < (double)max_cpus) {
                        uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

                        lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
                        lxcfs_v("delta: %lu\n", delta);
                        lxcfs_v("idle_sum before: %lu\n", idle_sum);
                        idle_sum = idle_sum > delta ? idle_sum - delta : 0;
                        lxcfs_v("idle_sum after: %lu\n", idle_sum);

                        curcpu = max_diff_idle_index;
                        lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
                        stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
                        lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
                }
        } else {
                for (curcpu = 0; curcpu < nprocs; curcpu++) {
                        if (!stat_node->usage[curcpu].online)
                                continue;

                        stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
                        stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
                        stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

                        user_sum += stat_node->view[curcpu].user;
                        system_sum += stat_node->view[curcpu].system;
                        idle_sum += stat_node->view[curcpu].idle;
                }
        }

        /* Render the file */
        /* cpu-all */
        l = snprintf(buf, buf_size,
                     "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
                     user_sum, system_sum, idle_sum);
        lxcfs_v("cpu-all: %s\n", buf);
        if (l < 0)
                return log_error(0, "Failed to write cache");
        if (l >= buf_size)
                return log_error(0, "Write to cache was truncated");

        buf += l;
        buf_size -= l;
        total_len += l;

        /* Render visible CPUs */
        for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
                if (!stat_node->usage[curcpu].online)
                        continue;

                i++;

                if (max_cpus > 0 && i == max_cpus)
                        break;

                l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
                             i,
                             stat_node->view[curcpu].user,
                             stat_node->view[curcpu].system,
                             stat_node->view[curcpu].idle);
                lxcfs_v("cpu: %s\n", buf);
                if (l < 0)
                        return log_error(0, "Failed to write cache");
                if (l >= buf_size)
                        return log_error(0, "Write to cache was truncated");

                buf += l;
                buf_size -= l;
                total_len += l;
        }

        /* Pass the rest of /proc/stat, start with the last line read */
        l = snprintf(buf, buf_size, "%s", line);
        if (l < 0)
                return log_error(0, "Failed to write cache");
        if (l >= buf_size)
                return log_error(0, "Write to cache was truncated");

        buf += l;
        buf_size -= l;
        total_len += l;

        /* Pass the rest of the host's /proc/stat */
        while (getline(&line, &linelen, f) != -1) {
                l = snprintf(buf, buf_size, "%s", line);
                if (l < 0)
                        return log_error(0, "Failed to write cache");
                if (l >= buf_size)
                        return log_error(0, "Write to cache was truncated");

                buf += l;
                buf_size -= l;
                total_len += l;
        }

        if (stat_node)
                pthread_mutex_unlock(&stat_node->lock);

        return total_len;
}
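
/*
 * Illustrative input/output (numbers invented): given a host /proc/stat line
 * "cpu0 4882 0 2710 103728 0 0 0 0 0 0" and a container confined to one CPU,
 * the rendered view starts with something like:
 *
 *      cpu 1902 0 1169 72000 0 0 0 0 0 0
 *      cpu0 1902 0 1169 72000 0 0 0 0 0 0
 *
 * i.e. only user, system and idle are reported per visible CPU; the nice,
 * iowait, irq, softirq, steal and guest columns are always rendered as 0.
 */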

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
        int cpu;
        return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;

        if (sscanf(line, "processor : %d", &cpu) == 1)
                return cpu_in_cpuset(cpu, cpuset);

        return false;
}
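
/*
 * Illustrative input (typical /proc/cpuinfo lines): on most architectures
 * each logical CPU starts a stanza such as "processor : 0"; the whitespace
 * directives in the sscanf() format match the run of blanks around the
 * colon. s390x is special-cased in proc_cpuinfo_read() below because it
 * prints a summary header ("vendor_id : IBM/S390", "# processors : 4")
 * followed by per-CPU lines of the form "processor 0: version = ...".
 */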

int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                      struct fuse_file_info *fi)
{
        __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
        __do_free void *fopen_cache = NULL;
        __do_fclose FILE *f = NULL;
        struct fuse_context *fc = fuse_get_context();
        struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
        struct file_info *d = INTTYPE_TO_PTR(fi->fh);
        size_t linelen = 0, total_len = 0;
        bool am_printing = false, firstline = true, is_s390x = false;
        int curcpu = -1, cpu, max_cpus = 0;
        bool use_view;
        char *cache = d->buf;
        size_t cache_size = d->buflen;

        if (offset) {
                int left;

                if (offset > d->size)
                        return -EINVAL;

                if (!d->cached)
                        return 0;

                left = d->size - offset;
                total_len = left > size ? size : left;
                memcpy(buf, cache + offset, total_len);

                return total_len;
        }

        pid_t initpid = lookup_initpid_in_store(fc->pid);
        if (initpid <= 1 || is_shared_pidns(initpid))
                initpid = fc->pid;

        cg = get_pid_cgroup(initpid, "cpuset");
        if (!cg)
                return read_file_fuse("proc/cpuinfo", buf, size, d);
        prune_init_slice(cg);

        cpuset = get_cpuset(cg);
        if (!cpuset)
                return 0;

        if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
                use_view = true;
        else
                use_view = false;
        if (use_view)
                max_cpus = max_cpu_count(cg);

        f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
        if (!f)
                return 0;

        while (getline(&line, &linelen, f) != -1) {
                ssize_t l;

                if (firstline) {
                        firstline = false;
                        if (strstr(line, "IBM/S390") != NULL) {
                                is_s390x = true;
                                am_printing = true;
                                continue;
                        }
                }

                if (strncmp(line, "# processors:", 12) == 0)
                        continue;

                if (is_processor_line(line)) {
                        if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
                                break;

                        am_printing = cpuline_in_cpuset(line, cpuset);
                        if (am_printing) {
                                curcpu++;
                                l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
                                if (l < 0)
                                        return log_error(0, "Failed to write cache");
                                if (l >= cache_size)
                                        return log_error(0, "Write to cache was truncated");

                                cache += l;
                                cache_size -= l;
                                total_len += l;
                        }
                        continue;
                } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
                        char *p;

                        if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
                                break;

                        if (!cpu_in_cpuset(cpu, cpuset))
                                continue;

                        curcpu++;
                        p = strchr(line, ':');
                        if (!p || !*p)
                                return 0;
                        p++;

                        l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
                        if (l < 0)
                                return log_error(0, "Failed to write cache");
                        if (l >= cache_size)
                                return log_error(0, "Write to cache was truncated");

                        cache += l;
                        cache_size -= l;
                        total_len += l;
                        continue;
                }

                if (am_printing) {
                        l = snprintf(cache, cache_size, "%s", line);
                        if (l < 0)
                                return log_error(0, "Failed to write cache");
                        if (l >= cache_size)
                                return log_error(0, "Write to cache was truncated");

                        cache += l;
                        cache_size -= l;
                        total_len += l;
                }
        }

        if (is_s390x) {
                __do_free char *origcache = d->buf;
                ssize_t l;

                d->buf = malloc(d->buflen);
                if (!d->buf) {
                        d->buf = move_ptr(origcache);
                        return 0;
                }

                cache = d->buf;
                cache_size = d->buflen;
                total_len = 0;
                l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
                if (l < 0 || l >= cache_size)
                        return 0;

                cache_size -= l;
                cache += l;
                total_len += l;
                l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
                if (l < 0 || l >= cache_size)
                        return 0;

                cache_size -= l;
                cache += l;
                total_len += l;
                l = snprintf(cache, cache_size, "%s", origcache);
                if (l < 0 || l >= cache_size)
                        return 0;
                total_len += l;
        }

        d->cached = 1;
        d->size = total_len;
        if (total_len > size)
                total_len = size;

        /* read from off 0 */
        memcpy(buf, d->buf, total_len);

        return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
int read_cpuacct_usage_all(char *cg, char *cpuset,
                           struct cpuacct_usage **return_usage, int *size)
{
        __do_free char *usage_str = NULL;
        __do_free struct cpuacct_usage *cpu_usage = NULL;
        int i = 0, j = 0, read_pos = 0, read_cnt = 0;
        int cpucount;
        int ret;
        int cg_cpu;
        uint64_t cg_user, cg_system;
        int64_t ticks_per_sec;

        ticks_per_sec = sysconf(_SC_CLK_TCK);
        if (ticks_per_sec < 0 && errno == EINVAL) {
                lxcfs_debug("%m - Failed to determine number of ticks per second");
                return -1;
        }

        cpucount = get_nprocs_conf();
        cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
        if (!cpu_usage)
                return -ENOMEM;

        memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
        if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
                char *sep = " \t\n";
                char *tok;

                /* Read cpuacct.usage_percpu instead. */
                lxcfs_debug("Falling back to cpuacct.usage_percpu");
                if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
                        return -1;

                lxc_iterate_parts(tok, usage_str, sep) {
                        uint64_t percpu_user;

                        if (i >= cpucount)
                                break;

                        tok = trim_whitespace_in_place(tok);
                        ret = safe_uint64(tok, &percpu_user, 10);
                        if (ret)
                                return -1;

                        /* Convert the time from nanoseconds to USER_HZ */
                        cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
                        cpu_usage[i].system = cpu_usage[i].user;
                        i++;
                        lxcfs_debug("cpu%d with time %s", i, tok);
                }
        } else {
                if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
                        return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

                read_pos += read_cnt;

                for (i = 0, j = 0; i < cpucount; i++) {
                        ret = sscanf(usage_str + read_pos,
                                     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
                                     &cg_user, &cg_system, &read_cnt);

                        if (ret == EOF)
                                break;

                        if (ret != 3)
                                return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
                                                 usage_str + read_pos, cg);

                        read_pos += read_cnt;

                        /* Convert the time from nanoseconds to USER_HZ */
                        cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
                        cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
                        j++;
                }
        }

        *return_usage = move_ptr(cpu_usage);
        *size = cpucount;
        return 0;
}
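
/*
 * Illustrative file contents (numbers invented): cpuacct.usage_all starts
 * with a header line and then one line per CPU with cumulative user and
 * system time in nanoseconds, e.g.:
 *
 *      cpu user system
 *      0 4866590569313 2938802373930
 *      1 4473321351366 2945130184458
 *
 * whereas cpuacct.usage_percpu is a single line of per-CPU totals such as
 * "7805392942643 7418451535824". The parsers above divide by 10^9 and
 * multiply by sysconf(_SC_CLK_TCK) to convert nanoseconds into USER_HZ
 * ticks, the unit /proc/stat reports in.
 */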

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
        *head = malloc(sizeof(struct cg_proc_stat_head));
        if (!(*head))
                return log_error(false, "%s", strerror(errno));

        (*head)->lastcheck = time(NULL);
        (*head)->next = NULL;

        if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
                free_disarm(*head);
                return log_error(false, "Failed to initialize list lock");
        }

        return true;
}

bool init_cpuview(void)
{
        int i;

        for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
                proc_stat_history[i] = NULL;

        for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
                if (!cpuview_init_head(&proc_stat_history[i]))
                        goto err;
        }

        return true;

err:
        for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
                if (proc_stat_history[i])
                        free_disarm(proc_stat_history[i]);
        }

        return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
        struct cg_proc_stat *node;

        if (head->next) {
                node = head->next;

                for (;;) {
                        struct cg_proc_stat *cur = node;
                        node = node->next;
                        free_proc_stat_node(cur);
                        if (!node)
                                break;
                }
        }

        pthread_rwlock_destroy(&head->lock);
        free_disarm(head);
}

void free_cpuview(void)
{
        for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
                if (proc_stat_history[i])
                        cpuview_free_head(proc_stat_history[i]);
}