1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #include "config.h"
8
9 #ifdef HAVE_FUSE3
10 #ifndef FUSE_USE_VERSION
11 #define FUSE_USE_VERSION 30
12 #endif
13 #else
14 #ifndef FUSE_USE_VERSION
15 #define FUSE_USE_VERSION 26
16 #endif
17 #endif
18
19 #define _FILE_OFFSET_BITS 64
20
21 #define __STDC_FORMAT_MACROS
22 #include <dirent.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <fuse.h>
26 #include <inttypes.h>
27 #include <libgen.h>
28 #include <pthread.h>
29 #include <sched.h>
30 #include <stdarg.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <time.h>
37 #include <unistd.h>
38 #include <wait.h>
39 #include <linux/magic.h>
40 #include <linux/sched.h>
41 #include <sys/epoll.h>
42 #include <sys/mman.h>
43 #include <sys/mount.h>
44 #include <sys/param.h>
45 #include <sys/socket.h>
46 #include <sys/syscall.h>
47 #include <sys/sysinfo.h>
48 #include <sys/vfs.h>
49
50 #include "bindings.h"
51 #include "cgroup_fuse.h"
52 #include "cpuset_parse.h"
53 #include "cgroups/cgroup.h"
54 #include "cgroups/cgroup_utils.h"
55 #include "memory_utils.h"
56 #include "proc_loadavg.h"
57 #include "utils.h"
58
59 /* Data for CPU view */
60 struct cg_proc_stat {
61 char *cg;
62 struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat. */
63 struct cpuacct_usage *view; /* Usage stats reported to the container. */
64 int cpu_count;
65 pthread_mutex_t lock; /* For node manipulation. */
66 struct cg_proc_stat *next;
67 };
68
69 struct cg_proc_stat_head {
70 struct cg_proc_stat *next;
71 time_t lastcheck;
72
73 /*
74 * For access to the list. Reading can be parallel, pruning is exclusive.
75 */
76 pthread_rwlock_t lock;
77 };
78
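/*
 * Per-cgroup stat nodes are cached in a small hash table so that usage deltas
 * can be tracked between reads of a container's /proc/stat. Nodes are hashed
 * by cgroup path (calc_hash(cg) % CPUVIEW_HASH_SIZE); each bucket is a
 * singly-linked list guarded by the head's rwlock, while each node's counters
 * are guarded by the node's own mutex.
 */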
79 #define CPUVIEW_HASH_SIZE 100
80 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
81
82 static void reset_proc_stat_node(struct cg_proc_stat *node,
83 struct cpuacct_usage *usage, int cpu_count)
84 {
85 lxcfs_debug("Resetting stat node for %s\n", node->cg);
86 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
87
88 for (int i = 0; i < cpu_count; i++) {
89 node->view[i].user = 0;
90 node->view[i].system = 0;
91 node->view[i].idle = 0;
92 }
93
94 node->cpu_count = cpu_count;
95 }
96
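/*
 * Grow a node's usage/view arrays to cpu_count entries, e.g. after CPUs were
 * hotplugged on the host. Existing per-CPU counters are preserved and the new
 * slots start out zeroed.
 */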
97 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
98 {
99 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
100
101 /* Allocate new memory */
102 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
103 if (!new_usage)
104 return false;
105
106 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
107 if (!new_view)
108 return false;
109
110 /* Copy existing data & initialize new elements */
111 for (int i = 0; i < cpu_count; i++) {
112 if (i < node->cpu_count) {
113 new_usage[i].user = node->usage[i].user;
114 new_usage[i].system = node->usage[i].system;
115 new_usage[i].idle = node->usage[i].idle;
116
117 new_view[i].user = node->view[i].user;
118 new_view[i].system = node->view[i].system;
119 new_view[i].idle = node->view[i].idle;
120 }
121 }
122
123 free(node->usage);
124 node->usage = move_ptr(new_usage);
125
126 free(node->view);
127 node->view = move_ptr(new_view);
128 node->cpu_count = cpu_count;
129
130 return true;
131 }
132
133 static void free_proc_stat_node(struct cg_proc_stat *node)
134 {
135 if (node) {
136 /*
137 * We're abusing the usage pointer to indicate that
138 * pthread_mutex_init() was successful. Don't judge me.
139 */
140 if (node->usage)
141 pthread_mutex_destroy(&node->lock);
142 free_disarm(node->cg);
143 free_disarm(node->usage);
144 free_disarm(node->view);
145 free_disarm(node);
146 }
147 }
148
149 define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
150
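/*
 * Insert new_node into its hash bucket. If a node for the same cgroup already
 * exists, the newly allocated node is freed by the cleanup handler and the
 * existing node is returned instead.
 */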
151 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
152 {
153 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
154 struct cg_proc_stat *rv = new_node;
155 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
156 struct cg_proc_stat_head *head = proc_stat_history[hash];
157 struct cg_proc_stat *cur;
158
159 pthread_rwlock_wrlock(&head->lock);
160
161 if (!head->next) {
162 head->next = move_ptr(new);
163 goto out_rwlock_unlock;
164 }
165
166 cur = head->next;
167
168 for (;;) {
169 /*
170 * The node to be added is already present in the list, so
171 * free the newly allocated one and return the one we found.
172 */
173 if (strcmp(cur->cg, new->cg) == 0) {
174 rv = cur;
175 goto out_rwlock_unlock;
176 }
177
178 /* Keep walking. */
179 if (cur->next) {
180 cur = cur->next;
181 continue;
182 }
183
184 /* Add new node to end of list. */
185 cur->next = move_ptr(new);
186 goto out_rwlock_unlock;
187 }
188
189 out_rwlock_unlock:
190 pthread_rwlock_unlock(&head->lock);
191 return move_ptr(rv);
192 }
193
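/*
 * Allocate and initialize a stat node for the given cgroup, duplicating the
 * current usage counters and starting the view counters at zero.
 */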
194 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
195 int cpu_count, const char *cg)
196 {
197 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
198 __do_free struct cpuacct_usage *new_usage = NULL;
199
200 node = zalloc(sizeof(struct cg_proc_stat));
201 if (!node)
202 return NULL;
203
204 node->cg = strdup(cg);
205 if (!node->cg)
206 return NULL;
207
208 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
209 if (!new_usage)
210 return NULL;
211
212 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
213 if (!node->view)
214 return NULL;
215
216 node->cpu_count = cpu_count;
217
218 if (pthread_mutex_init(&node->lock, NULL))
219 return NULL;
220 /*
221 * We're abusing the usage pointer to indicate that
222 * pthread_mutex_init() was successful. Don't judge me.
223 */
224 node->usage = move_ptr(new_usage);
225
226 return move_ptr(node);
227 }
228
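/*
 * Check whether the cgroup still exists and exposes the given file by probing
 * it with faccessat() relative to the controller's mount fd.
 */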
229 static bool cgroup_supports(const char *controller, const char *cgroup,
230 const char *file)
231 {
232 __do_free char *path = NULL;
233 int cfd;
234
235 cfd = get_cgroup_fd(controller);
236 if (cfd < 0)
237 return false;
238
239 path = must_make_path_relative(cgroup, file, NULL);
240 return faccessat(cfd, path, F_OK, 0) == 0;
241 }
242
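/*
 * Walk one hash bucket and drop stat nodes whose cgroup no longer provides
 * cpu.shares (typically because the cgroup was removed). Returns the new head
 * of the list.
 */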
243 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
244 {
245 struct cg_proc_stat *first = NULL;
246
247 for (struct cg_proc_stat *prev = NULL; node; ) {
248 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
249 call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;
250
251 if (prev)
252 prev->next = node->next;
253 else
254 first = node->next;
255
256 node = node->next;
257 lxcfs_debug("Removing stat node for %s\n", cur->cg);
258 } else {
259 if (!first)
260 first = node;
261 prev = node;
262 node = node->next;
263 }
264 }
265
266 return first;
267 }
268
269 #define PROC_STAT_PRUNE_INTERVAL 10
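/*
 * Prune stale nodes from every hash bucket. A bucket is rescanned at most
 * once per PROC_STAT_PRUNE_INTERVAL seconds, tracked via lastcheck.
 */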
270 static void prune_proc_stat_history(void)
271 {
272 time_t now = time(NULL);
273
274 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
275 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
276
277 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
278 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
279 return;
280 }
281
282 if (proc_stat_history[i]->next) {
283 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
284 proc_stat_history[i]->lastcheck = now;
285 }
286
287 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
288 }
289 }
290
291 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
292 const char *cg)
293 {
294 struct cg_proc_stat *node;
295
296 pthread_rwlock_rdlock(&head->lock);
297
298 if (!head->next) {
299 pthread_rwlock_unlock(&head->lock);
300 return NULL;
301 }
302
303 node = head->next;
304
305 do {
306 if (strcmp(cg, node->cg) == 0)
307 goto out;
308 } while ((node = node->next));
309
310 node = NULL;
311
312 out:
313 pthread_rwlock_unlock(&head->lock);
314 prune_proc_stat_history();
315 return node;
316 }
317
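/*
 * Look up the stat node for the given cgroup, creating it if needed, and
 * expand its per-CPU arrays if the host gained CPUs. On success the node is
 * returned with node->lock held; the caller must unlock it.
 */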
318 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
319 int cpu_count, const char *cg)
320 {
321 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
322 struct cg_proc_stat_head *head = proc_stat_history[hash];
323 struct cg_proc_stat *node;
324
325 node = find_proc_stat_node(head, cg);
326 if (!node) {
327 node = new_proc_stat_node(usage, cpu_count, cg);
328 if (!node)
329 return NULL;
330
331 node = add_proc_stat_node(node);
332 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
333 }
334
335 pthread_mutex_lock(&node->lock);
336
337 /*
338 * If additional CPUs on the host have been enabled, CPU usage counter
339 * arrays have to be expanded.
340 */
341 if (node->cpu_count < cpu_count) {
342 lxcfs_debug("Expanding stat node %d->%d for %s\n",
343 node->cpu_count, cpu_count, cg);
344
345 if (!expand_proc_stat_node(node, cpu_count)) {
346 pthread_mutex_unlock(&node->lock);
347 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
348 }
349 }
350
351 return node;
352 }
353
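/*
 * Move up to *surplus ticks onto this CPU's counter. The amount added is
 * limited by the per-CPU threshold (minus what is already accounted as user
 * plus system) and by the CPU's remaining idle time; whatever is added is
 * subtracted from both the idle time and the surplus.
 */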
354 static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
355 uint64_t *counter, uint64_t threshold)
356 {
357 uint64_t free_space, to_add;
358
359 free_space = threshold - usage->user - usage->system;
360
361 if (free_space > usage->idle)
362 free_space = usage->idle;
363
364 if (free_space > *surplus)
365 to_add = *surplus;
366 else
367 to_add = free_space;
368
369 *counter += to_add;
370 usage->idle -= to_add;
371 *surplus -= to_add;
372 }
373
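/*
 * Compute per-CPU deltas between two usage snapshots for all online CPUs,
 * clamping each field at zero if a counter went backwards, and return the sum
 * of all deltas.
 */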
374 static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
375 struct cpuacct_usage *newer,
376 struct cpuacct_usage *diff, int cpu_count)
377 {
378 uint64_t sum = 0;
379
380 for (int i = 0; i < cpu_count; i++) {
381 if (!newer[i].online)
382 continue;
383
384 /*
385 * When cpuset is changed on the fly, the CPUs might get
386 * reordered. We could either reset all counters, or check
387  * that the subtractions below will return expected results.
388 */
389 if (newer[i].user > older[i].user)
390 diff[i].user = newer[i].user - older[i].user;
391 else
392 diff[i].user = 0;
393
394 if (newer[i].system > older[i].system)
395 diff[i].system = newer[i].system - older[i].system;
396 else
397 diff[i].system = 0;
398
399 if (newer[i].idle > older[i].idle)
400 diff[i].idle = newer[i].idle - older[i].idle;
401 else
402 diff[i].idle = 0;
403
404 sum += diff[i].user;
405 sum += diff[i].system;
406 sum += diff[i].idle;
407 }
408
409 return sum;
410 }
411
412 /*
413  * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
414  * `cpu.cfs_period_us` (or from `cpu.max` on the unified hierarchy),
415  * depending on `param`. The parameter value is returned through `value`.
416 */
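/*
 * Example: on the unified (cgroup2) layout `cpu.max` holds "<quota> <period>"
 * on a single line, e.g. "150000 100000", so the quota is parsed with
 * "%" PRId64 and the period with "%*d %" PRId64. When no limit is set the
 * file reads "max <period>" and the parse fails, which callers treat as
 * "no quota". On the legacy layout the values come from `cpu.cfs_quota_us`
 * and `cpu.cfs_period_us`, where -1 means no quota.
 */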
417 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
418 {
419 __do_free char *str = NULL;
420 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
421 bool first = true;
422 int ret;
423
424 if (pure_unified_layout(cgroup_ops)) {
425 first = !strcmp(param, "quota");
426 ret = snprintf(file, sizeof(file), "cpu.max");
427 } else {
428 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
429 }
430 if (ret < 0 || (size_t)ret >= sizeof(file))
431 return false;
432
433 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
434 return false;
435
436 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
437 }
438
439 /*
440 * Return the exact number of visible CPUs based on CPU quotas.
441 * If there is no quota set, zero is returned.
442 */
443 static double exact_cpu_count(const char *cg)
444 {
445 double rv;
446 int nprocs;
447 int64_t cfs_quota, cfs_period;
448
449 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
450 return 0;
451
452 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
453 return 0;
454
455 if (cfs_quota <= 0 || cfs_period <= 0)
456 return 0;
457
458 rv = (double)cfs_quota / (double)cfs_period;
459
460 nprocs = get_nprocs();
461
462 if (rv > nprocs)
463 rv = nprocs;
464
465 return rv;
466 }
467
468 /*
469 * Return the maximum number of visible CPUs based on CPU quotas.
470 * If there is no quota set, zero is returned.
471 */
472 int max_cpu_count(const char *cg)
473 {
474 __do_free char *cpuset = NULL;
475 int rv, nprocs;
476 int64_t cfs_quota, cfs_period;
477 int nr_cpus_in_cpuset = 0;
478
479 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
480 return 0;
481
482 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
483 return 0;
484
485 cpuset = get_cpuset(cg);
486 if (cpuset)
487 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
488
489 if (cfs_quota <= 0 || cfs_period <= 0) {
490 if (nr_cpus_in_cpuset > 0)
491 return nr_cpus_in_cpuset;
492
493 return 0;
494 }
495
496 rv = cfs_quota / cfs_period;
497
498 /*
499 * In case quota/period does not yield a whole number, add one CPU for
500 * the remainder.
501 */
502 if ((cfs_quota % cfs_period) > 0)
503 rv += 1;
504
505 nprocs = get_nprocs();
506 if (rv > nprocs)
507 rv = nprocs;
508
509         /* Use the smaller of the quota-based CPU count and the cpuset CPU count. */
510 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
511 rv = nr_cpus_in_cpuset;
512
513 return rv;
514 }
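/*
 * Example: cfs_quota = 250000 with cfs_period = 100000 divides to 2 with a
 * remainder, so rv becomes 3 visible CPUs; the result is then capped by the
 * number of online host CPUs and, if smaller, by the number of CPUs in the
 * cpuset.
 */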
515
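/*
 * Build the container's view of /proc/stat: read the host's per-CPU lines,
 * diff them against the cached per-cgroup counters, fold usage of CPUs beyond
 * the quota-derived limit back onto the visible CPUs, then render a "cpu"
 * summary line, one "cpuN" line per visible CPU and the untouched remainder
 * of the host file. Returns the number of bytes written to buf, 0 on error.
 */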
516 int cpuview_proc_stat(const char *cg, const char *cpuset,
517 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
518 FILE *f, char *buf, size_t buf_size)
519 {
520 __do_free char *line = NULL;
521 __do_free struct cpuacct_usage *diff = NULL;
522 size_t linelen = 0, total_len = 0;
523 int curcpu = -1; /* cpu numbering starts at 0 */
524 int physcpu, i;
525 int cpu_cnt = 0;
526 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
527 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
528 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
529 uint64_t user_surplus = 0, system_surplus = 0;
530 int nprocs, max_cpus;
531 ssize_t l;
532 uint64_t total_sum, threshold;
533 struct cg_proc_stat *stat_node;
534
535 nprocs = get_nprocs_conf();
536 if (cg_cpu_usage_size < nprocs)
537 nprocs = cg_cpu_usage_size;
538
539         /* Read all CPU stats and stop once we hit a non-CPU line. */
540 while (getline(&line, &linelen, f) != -1) {
541 int ret;
542 char cpu_char[10]; /* That's a lot of cores */
543 uint64_t all_used, cg_used;
544
545 if (strlen(line) == 0)
546 continue;
547
548 /* not a ^cpuN line containing a number N */
549 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
550 break;
551
552 if (sscanf(cpu_char, "%d", &physcpu) != 1)
553 continue;
554
555 if (physcpu >= cg_cpu_usage_size)
556 continue;
557
558 curcpu++;
559 cpu_cnt++;
560
561 if (!cpu_in_cpuset(physcpu, cpuset)) {
562 for (i = curcpu; i <= physcpu; i++)
563 cg_cpu_usage[i].online = false;
564 continue;
565 }
566
567 if (curcpu < physcpu) {
568 /* Some CPUs may be disabled */
569 for (i = curcpu; i < physcpu; i++)
570 cg_cpu_usage[i].online = false;
571
572 curcpu = physcpu;
573 }
574
575 cg_cpu_usage[curcpu].online = true;
576
577                 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
578 &user,
579 &nice,
580 &system,
581 &idle,
582 &iowait,
583 &irq,
584 &softirq,
585 &steal,
586 &guest,
587 &guest_nice);
588 if (ret != 10)
589 continue;
590
591 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
592 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
593
594 if (all_used >= cg_used) {
595 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
596
597 } else {
598 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
599 curcpu, cg, all_used, cg_used);
600 cg_cpu_usage[curcpu].idle = idle;
601 }
602 }
603
604         /* Cannot use more CPUs than are available in the cpuset. */
605 max_cpus = max_cpu_count(cg);
606 if (max_cpus > cpu_cnt || !max_cpus)
607 max_cpus = cpu_cnt;
608
609         /* Returns with node->lock held; we release it at out_pthread_mutex_unlock. */
610 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
611 if (!stat_node)
612 return log_error(0, "Failed to find/create stat node for %s", cg);
613
614 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
615 if (!diff)
616 return 0;
617
618 /*
619 * If the new values are LOWER than values stored in memory, it means
620 * the cgroup has been reset/recreated and we should reset too.
621 */
622 for (curcpu = 0; curcpu < nprocs; curcpu++) {
623 if (!cg_cpu_usage[curcpu].online)
624 continue;
625
626 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
627 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
628
629 break;
630 }
631
632 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
633
634 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
635 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
636
637 if (!stat_node->usage[curcpu].online)
638 continue;
639
640 i++;
641
642 stat_node->usage[curcpu].user += diff[curcpu].user;
643 stat_node->usage[curcpu].system += diff[curcpu].system;
644 stat_node->usage[curcpu].idle += diff[curcpu].idle;
645
646 if (max_cpus > 0 && i >= max_cpus) {
647 user_surplus += diff[curcpu].user;
648 system_surplus += diff[curcpu].system;
649 }
650 }
651
652 /* Calculate usage counters of visible CPUs */
653 if (max_cpus > 0) {
654 uint64_t diff_user = 0;
655 uint64_t diff_system = 0;
656 uint64_t diff_idle = 0;
657 uint64_t max_diff_idle = 0;
658 uint64_t max_diff_idle_index = 0;
659 double exact_cpus;
660
661 /* threshold = maximum usage per cpu, including idle */
662 threshold = total_sum / cpu_cnt * max_cpus;
663
664 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
665 if (!stat_node->usage[curcpu].online)
666 continue;
667
668 i++;
669
670 if (i == max_cpus)
671 break;
672
673 if (diff[curcpu].user + diff[curcpu].system >= threshold)
674 continue;
675
676 /* Add user */
677 add_cpu_usage(&user_surplus, &diff[curcpu],
678 &diff[curcpu].user, threshold);
679
680 if (diff[curcpu].user + diff[curcpu].system >= threshold)
681 continue;
682
683 /* If there is still room, add system */
684 add_cpu_usage(&system_surplus, &diff[curcpu],
685 &diff[curcpu].system, threshold);
686 }
687
688 if (user_surplus > 0)
689 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
690 if (system_surplus > 0)
691 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
692
693 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
694 if (!stat_node->usage[curcpu].online)
695 continue;
696
697 i++;
698
699 if (i == max_cpus)
700 break;
701
702 stat_node->view[curcpu].user += diff[curcpu].user;
703 stat_node->view[curcpu].system += diff[curcpu].system;
704 stat_node->view[curcpu].idle += diff[curcpu].idle;
705
706 user_sum += stat_node->view[curcpu].user;
707 system_sum += stat_node->view[curcpu].system;
708 idle_sum += stat_node->view[curcpu].idle;
709
710 diff_user += diff[curcpu].user;
711 diff_system += diff[curcpu].system;
712 diff_idle += diff[curcpu].idle;
713 if (diff[curcpu].idle > max_diff_idle) {
714 max_diff_idle = diff[curcpu].idle;
715 max_diff_idle_index = curcpu;
716 }
717
718 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
719 }
720 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
721
722                 /* Revise the cpu usage view to account for a fractional cpu quota. */
723 exact_cpus = exact_cpu_count(cg);
724 if (exact_cpus < (double)max_cpus){
725 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
726
727 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
728 lxcfs_v("delta: %lu\n", delta);
729 lxcfs_v("idle_sum before: %lu\n", idle_sum);
730 if (idle_sum > delta)
731 idle_sum = idle_sum - delta;
732 else
733 idle_sum = 0;
734 lxcfs_v("idle_sum after: %lu\n", idle_sum);
735
736 curcpu = max_diff_idle_index;
737 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
738 if (stat_node->view[curcpu].idle > delta)
739 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
740 else
741 stat_node->view[curcpu].idle = 0;
742 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
743 }
744 } else {
745 for (curcpu = 0; curcpu < nprocs; curcpu++) {
746 if (!stat_node->usage[curcpu].online)
747 continue;
748
749 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
750 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
751 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
752
753 user_sum += stat_node->view[curcpu].user;
754 system_sum += stat_node->view[curcpu].system;
755 idle_sum += stat_node->view[curcpu].idle;
756 }
757 }
758
759 /* Render the file */
760 /* cpu-all */
761 l = snprintf(buf, buf_size,
762 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
763 user_sum, system_sum, idle_sum);
764 lxcfs_v("cpu-all: %s\n", buf);
765 if (l < 0) {
766 lxcfs_error("Failed to write cache");
767 total_len = 0;
768 goto out_pthread_mutex_unlock;
769 }
770 if (l >= buf_size)
771 return log_error(0, "Write to cache was truncated");
772
773 buf += l;
774 buf_size -= l;
775 total_len += l;
776
777 /* Render visible CPUs */
778 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
779 if (!stat_node->usage[curcpu].online)
780 continue;
781
782 i++;
783
784 if (max_cpus > 0 && i == max_cpus)
785 break;
786
787 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
788 i,
789 stat_node->view[curcpu].user,
790 stat_node->view[curcpu].system,
791 stat_node->view[curcpu].idle);
792 lxcfs_v("cpu: %s\n", buf);
793 if (l < 0) {
794 lxcfs_error("Failed to write cache");
795 total_len = 0;
796 goto out_pthread_mutex_unlock;
797 }
798 if (l >= buf_size) {
799 lxcfs_error("Write to cache was truncated");
800 total_len = 0;
801 goto out_pthread_mutex_unlock;
802 }
803
804 buf += l;
805 buf_size -= l;
806 total_len += l;
807 }
808
809         /* Pass through the rest of /proc/stat, starting with the last line read. */
810 l = snprintf(buf, buf_size, "%s", line);
811 if (l < 0) {
812 lxcfs_error("Failed to write cache");
813 total_len = 0;
814 goto out_pthread_mutex_unlock;
815 }
816 if (l >= buf_size) {
817 lxcfs_error("Write to cache was truncated");
818 total_len = 0;
819 goto out_pthread_mutex_unlock;
820 }
821
822 buf += l;
823 buf_size -= l;
824 total_len += l;
825
826 /* Pass the rest of the host's /proc/stat */
827 while (getline(&line, &linelen, f) != -1) {
828 l = snprintf(buf, buf_size, "%s", line);
829 if (l < 0) {
830 lxcfs_error("Failed to write cache");
831 total_len = 0;
832 goto out_pthread_mutex_unlock;
833 }
834 if (l >= buf_size) {
835 lxcfs_error("Write to cache was truncated");
836 total_len = 0;
837 goto out_pthread_mutex_unlock;
838 }
839
840 buf += l;
841 buf_size -= l;
842 total_len += l;
843 }
844
845 out_pthread_mutex_unlock:
846 if (stat_node)
847 pthread_mutex_unlock(&stat_node->lock);
848
849 return total_len;
850 }
851
852 /*
853 * check whether this is a '^processor" line in /proc/cpuinfo
854 */
855 static inline bool is_processor_line(const char *line)
856 {
857 int cpu;
858 return sscanf(line, "processor : %d", &cpu) == 1;
859 }
860
861 static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
862 {
863 int cpu;
864
865 if (sscanf(line, "processor : %d", &cpu) == 1)
866 return cpu_in_cpuset(cpu, cpuset);
867
868 return false;
869 }
870
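/*
 * FUSE read handler for the container's /proc/cpuinfo: only processor entries
 * that fall within the cgroup's cpuset are emitted, renumbered from 0, and
 * when CFS-based CPU views are enabled the output is additionally capped at
 * max_cpu_count() entries. s390x, which uses "processor N:" lines and a
 * "# processors" header, is special-cased.
 */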
871 int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
872 struct fuse_file_info *fi)
873 {
874 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
875 __do_free void *fopen_cache = NULL;
876 __do_fclose FILE *f = NULL;
877 struct fuse_context *fc = fuse_get_context();
878 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
879 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
880 size_t linelen = 0, total_len = 0;
881 bool am_printing = false, firstline = true, is_s390x = false;
882 int curcpu = -1, cpu, max_cpus = 0;
883 bool use_view;
884 char *cache = d->buf;
885 size_t cache_size = d->buflen;
886
887 if (offset) {
888 int left;
889
890 if (offset > d->size)
891 return -EINVAL;
892
893 if (!d->cached)
894 return 0;
895
896 left = d->size - offset;
897 total_len = left > size ? size: left;
898 memcpy(buf, cache + offset, total_len);
899
900 return total_len;
901 }
902
903 pid_t initpid = lookup_initpid_in_store(fc->pid);
904 if (initpid <= 1 || is_shared_pidns(initpid))
905 initpid = fc->pid;
906
907 cg = get_pid_cgroup(initpid, "cpuset");
908 if (!cg)
909 return read_file_fuse("proc/cpuinfo", buf, size, d);
910 prune_init_slice(cg);
911
912 cpuset = get_cpuset(cg);
913 if (!cpuset)
914 return 0;
915
916 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
917 use_view = true;
918 else
919 use_view = false;
920 if (use_view)
921 max_cpus = max_cpu_count(cg);
922
923 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
924 if (!f)
925 return 0;
926
927 while (getline(&line, &linelen, f) != -1) {
928 ssize_t l;
929 if (firstline) {
930 firstline = false;
931 if (strstr(line, "IBM/S390") != NULL) {
932 is_s390x = true;
933 am_printing = true;
934 continue;
935 }
936 }
937
938 if (strncmp(line, "# processors:", 12) == 0)
939 continue;
940
941 if (is_processor_line(line)) {
942 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
943 break;
944
945 am_printing = cpuline_in_cpuset(line, cpuset);
946 if (am_printing) {
947 curcpu++;
948 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
949 if (l < 0)
950 return log_error(0, "Failed to write cache");
951 if (l >= cache_size)
952 return log_error(0, "Write to cache was truncated");
953 cache += l;
954 cache_size -= l;
955 total_len += l;
956 }
957 continue;
958 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
959 char *p;
960
961 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
962 break;
963
964 if (!cpu_in_cpuset(cpu, cpuset))
965 continue;
966
967                         curcpu++;
968 p = strchr(line, ':');
969 if (!p || !*p)
970 return 0;
971 p++;
972
973 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
974 if (l < 0)
975 return log_error(0, "Failed to write cache");
976 if (l >= cache_size)
977 return log_error(0, "Write to cache was truncated");
978
979 cache += l;
980 cache_size -= l;
981 total_len += l;
982 continue;
983
984 }
985 if (am_printing) {
986 l = snprintf(cache, cache_size, "%s", line);
987 if (l < 0)
988 return log_error(0, "Failed to write cache");
989 if (l >= cache_size)
990 return log_error(0, "Write to cache was truncated");
991
992 cache += l;
993 cache_size -= l;
994 total_len += l;
995 }
996 }
997
998 if (is_s390x) {
999 __do_free char *origcache = d->buf;
1000 ssize_t l;
1001
1002 d->buf = malloc(d->buflen);
1003 if (!d->buf) {
1004 d->buf = move_ptr(origcache);
1005 return 0;
1006 }
1007
1008 cache = d->buf;
1009 cache_size = d->buflen;
1010 total_len = 0;
1011 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
1012 if (l < 0 || l >= cache_size)
1013 return 0;
1014
1015 cache_size -= l;
1016 cache += l;
1017 total_len += l;
1018 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
1019 if (l < 0 || l >= cache_size)
1020 return 0;
1021
1022 cache_size -= l;
1023 cache += l;
1024 total_len += l;
1025 l = snprintf(cache, cache_size, "%s", origcache);
1026 if (l < 0 || l >= cache_size)
1027 return 0;
1028 total_len += l;
1029 }
1030
1031 d->cached = 1;
1032 d->size = total_len;
1033 if (total_len > size)
1034 total_len = size;
1035
1036         /* Read from offset 0. */
1037 memcpy(buf, d->buf, total_len);
1038
1039 return total_len;
1040 }
1041
1042 /*
1043 * Returns 0 on success.
1044 * It is the caller's responsibility to free `return_usage`, unless this
1045 * function returns an error.
1046 */
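/*
 * cpuacct.usage_all is expected to look like
 *
 *	cpu user system
 *	0 234809399944 78723992544
 *	1 209761806862 74447204258
 *	...
 *
 * with times in nanoseconds. If it is unavailable, cpuacct.usage_percpu (one
 * combined value per CPU on a single line) is used as a fallback.
 */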
1047 int read_cpuacct_usage_all(char *cg, char *cpuset,
1048 struct cpuacct_usage **return_usage, int *size)
1049 {
1050 __do_free char *usage_str = NULL;
1051 __do_free struct cpuacct_usage *cpu_usage = NULL;
1052 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
1053 int cpucount;
1054 int ret;
1055 int cg_cpu;
1056 uint64_t cg_user, cg_system;
1057 int64_t ticks_per_sec;
1058
1059 ticks_per_sec = sysconf(_SC_CLK_TCK);
1060 if (ticks_per_sec < 0 && errno == EINVAL) {
1061 lxcfs_debug("%m - Failed to determine number of ticks per second");
1062 return -1;
1063 }
1064
1065 cpucount = get_nprocs_conf();
1066 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1067 if (!cpu_usage)
1068 return -ENOMEM;
1069
1070 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1071 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
1072 char *sep = " \t\n";
1073 char *tok;
1074
1075 /* Read cpuacct.usage_percpu instead. */
1076 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1077 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1078 return -1;
1079
1080 lxc_iterate_parts(tok, usage_str, sep) {
1081 uint64_t percpu_user;
1082
1083 if (i >= cpucount)
1084 break;
1085
1086 tok = trim_whitespace_in_place(tok);
1087 ret = safe_uint64(tok, &percpu_user, 10);
1088 if (ret)
1089 return -1;
1090
1091 /* Convert the time from nanoseconds to USER_HZ */
1092 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1093 cpu_usage[i].system = cpu_usage[i].user;
1094 i++;
1095 lxcfs_debug("cpu%d with time %s", i, tok);
1096 }
1097 } else {
1098 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1099 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1100
1101 read_pos += read_cnt;
1102
1103 for (i = 0, j = 0; i < cpucount; i++) {
1104 ret = sscanf(usage_str + read_pos,
1105 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1106 &cg_user, &cg_system, &read_cnt);
1107
1108 if (ret == EOF)
1109 break;
1110
1111 if (ret != 3)
1112 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1113 usage_str + read_pos, cg);
1114
1115 read_pos += read_cnt;
1116
1117 /* Convert the time from nanoseconds to USER_HZ */
1118 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1119 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1120 j++;
1121 }
1122 }
1123
1124 *return_usage = move_ptr(cpu_usage);
1125 *size = cpucount;
1126 return 0;
1127 }
1128
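/*
 * Allocate and initialize one hash bucket head, including its rwlock.
 */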
1129 static bool cpuview_init_head(struct cg_proc_stat_head **head)
1130 {
1131 __do_free struct cg_proc_stat_head *h;
1132
1133 h = zalloc(sizeof(struct cg_proc_stat_head));
1134 if (!h)
1135 return false;
1136
1137 if (pthread_rwlock_init(&h->lock, NULL))
1138 return false;
1139
1140 h->lastcheck = time(NULL);
1141
1142 *head = move_ptr(h);
1143 return true;
1144 }
1145
1146 bool init_cpuview(void)
1147 {
1148 int i;
1149
1150 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1151 proc_stat_history[i] = NULL;
1152
1153 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1154 if (!cpuview_init_head(&proc_stat_history[i]))
1155 goto err;
1156 }
1157
1158 return true;
1159
1160 err:
1161 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1162 if (proc_stat_history[i])
1163 free_disarm(proc_stat_history[i]);
1164 }
1165
1166 return false;
1167 }
1168
1169 static void cpuview_free_head(struct cg_proc_stat_head *head)
1170 {
1171 struct cg_proc_stat *node;
1172
1173 if (head->next) {
1174 node = head->next;
1175
1176 for (;;) {
1177 struct cg_proc_stat *cur = node;
1178 node = node->next;
1179 free_proc_stat_node(cur);
1180 if (!node)
1181 break;
1182 }
1183 }
1184
1185 pthread_rwlock_destroy(&head->lock);
1186 free_disarm(head);
1187 }
1188
1189 void free_cpuview(void)
1190 {
1191 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1192 if (proc_stat_history[i])
1193 cpuview_free_head(proc_stat_history[i]);
1194 }