/* SPDX-License-Identifier: LGPL-2.1+ */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *view;	/* Usage stats reported to the container. */
	int cpu_count;
	pthread_mutex_t lock;		/* For node manipulation. */
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

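/*
 * proc_stat_history is a fixed-size hash table keyed on the cgroup path:
 * each bucket holds a singly linked list of cg_proc_stat nodes protected
 * by the bucket's rwlock, while each node's mutex serializes updates to
 * that node's usage/view counters.
 */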
#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

static void reset_proc_stat_node(struct cg_proc_stat *node,
				 struct cpuacct_usage *usage, int cpu_count)
{
	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (int i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

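/*
 * free_proc_stat_node() below doubles as the automatic cleanup handler
 * hooked up via define_cleanup_function()/call_cleaner(). Together with
 * the __do_free/move_ptr() helpers from memory_utils.h this gives
 * scope-based cleanup: a guarded pointer is released automatically when it
 * goes out of scope, and move_ptr() transfers ownership on the success
 * path by handing out the pointer and nulling the guarded variable.
 * Roughly (a sketch of the idea, not the exact macros):
 *
 *	#define __do_free __attribute__((__cleanup__(__auto_free__)))
 *	#define move_ptr(ptr)                                 \
 *		({                                            \
 *			typeof(ptr) __internal_ptr__ = (ptr); \
 *			(ptr) = NULL;                         \
 *			__internal_ptr__;                     \
 *		})
 */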
static void free_proc_stat_node(struct cg_proc_stat *node)
{
	if (node) {
		/*
		 * We're abusing the usage pointer to indicate that
		 * pthread_mutex_init() was successful. Don't judge me.
		 */
		if (node->usage)
			pthread_mutex_destroy(&node->lock);
		free_disarm(node->cg);
		free_disarm(node->usage);
		free_disarm(node->view);
		free_disarm(node);
	}
}

define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);

static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
	struct cg_proc_stat *rv = new_node;
	int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *cur;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

	cur = head->next;

	for (;;) {
		/*
		 * The node to be added is already present in the list, so
		 * free the newly allocated one and return the one we found.
		 */
		if (strcmp(cur->cg, new->cg) == 0) {
			rv = cur;
			goto out_rwlock_unlock;
		}

		/* Keep walking. */
		if (cur->next) {
			cur = cur->next;
			continue;
		}

		/* Add new node to end of list. */
		cur->next = move_ptr(new);
		goto out_rwlock_unlock;
	}

out_rwlock_unlock:
	pthread_rwlock_unlock(&head->lock);
	return move_ptr(rv);
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
					       int cpu_count, const char *cg)
{
	call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
	__do_free struct cpuacct_usage *new_usage = NULL;

	node = zalloc(sizeof(struct cg_proc_stat));
	if (!node)
		return NULL;

	node->cg = strdup(cg);
	if (!node->cg)
		return NULL;

	new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return NULL;

	node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		return NULL;

	node->cpu_count = cpu_count;

	if (pthread_mutex_init(&node->lock, NULL))
		return NULL;
	/*
	 * We're abusing the usage pointer to indicate that
	 * pthread_mutex_init() was successful. Don't judge me.
	 */
	node->usage = move_ptr(new_usage);

	return move_ptr(node);
}

static bool cgfs_param_exist(const char *controller, const char *cgroup,
			     const char *file)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, file, NULL);
	return (faccessat(cfd, path, F_OK, 0) == 0);
}

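/*
 * Walk one bucket's list and drop every node whose cgroup has disappeared
 * from the host (detected via a missing cpu.shares file); returns the new
 * list head.
 */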
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL;

	for (struct cg_proc_stat *prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			struct cg_proc_stat *tmp = node;

			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

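/*
 * Look up (or create and insert) the stat node for @cg. On success the
 * node is returned with node->lock held; the caller must drop it, as
 * cpuview_proc_stat() does once it is done updating the counters.
 */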
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);
	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/*
	 * If additional CPUs on the host have been enabled, CPU usage
	 * counter arrays have to be expanded.
	 */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
		}
	}

	return node;
}

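/*
 * Shift surplus time (ticks that accrued on CPUs hidden from the
 * container) onto a visible CPU: up to threshold - user - system ticks,
 * further capped by the CPU's idle time and the remaining surplus, move
 * from idle into *counter. E.g. with threshold=100, user+system=70,
 * idle=40 and *surplus=50, 30 ticks move: *counter grows by 30, idle
 * drops to 10, and 20 ticks of surplus remain.
 */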
static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
			  uint64_t *counter, uint64_t threshold)
{
	uint64_t free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	to_add = free_space > *surplus ? *surplus : free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
			       struct cpuacct_usage *newer,
			       struct cpuacct_usage *diff, int cpu_count)
{
	uint64_t sum = 0;

	for (int i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/*
		 * When cpuset is changed on the fly, the CPUs might get
		 * reordered. We could either reset all counters, or check
		 * that the subtractions below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read a CPU quota parameter for cgroup @cg: on the legacy hierarchy this
 * reads `cpu.cfs_quota_us` or `cpu.cfs_period_us`, depending on @param; on
 * the unified hierarchy both values live in `cpu.max`. The parameter value
 * is returned through @value.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
	bool first = true;

	if (!pure_unified_layout(cgroup_ops)) {
		snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	} else {
		strcpy(file, "cpu.max");
		first = !strcmp(param, "quota");
	}

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	if (sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) != 1)
		return false;

	return true;
}

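/*
 * For reference, the raw values read by read_cpu_cfs_param() look like
 * this (cgroup v2 keeps both fields on one line in cpu.max, where a quota
 * of "max" means no limit and thus deliberately fails the integer parse):
 *
 *	cgroup v1: cpu.cfs_quota_us  -> "150000"
 *	           cpu.cfs_period_us -> "100000"
 *	cgroup v2: cpu.max           -> "150000 100000" (or "max 100000")
 */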
/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota = 0, cfs_period = 0; /* 0 if the reads below fail */

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}

/*
 * Return the maximum number of visible CPUs based on CPU quotas and the
 * cpuset. If neither restricts the cgroup, zero is returned.
 */
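/*
 * Example: cfs_quota=150000 and cfs_period=100000 yield 150000/100000 = 1
 * with a remainder, so 2 CPUs are shown; the result is then clamped to the
 * host's CPU count and, if smaller, to the number of CPUs in the cpuset.
 */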
int max_cpu_count(const char *cg)
{
	__do_free char *cpuset = NULL;
	int rv, nprocs;
	int64_t cfs_quota = 0, cfs_period = 0; /* 0 if the reads below fail */
	int nr_cpus_in_cpuset = 0;

	read_cpu_cfs_param(cg, "quota", &cfs_quota);
	read_cpu_cfs_param(cg, "period", &cfs_period);

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/*
	 * In case quota/period does not yield a whole number, add one CPU
	 * for the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	/* Use the minimum of the CPU quota and the cpuset count. */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}

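/*
 * Render the container's view of /proc/stat into @buf. The flow is:
 *
 * 1. Walk the host's cpuN lines, mark CPUs outside the cpuset offline, and
 *    derive a per-CPU idle figure from host time not used by the cgroup.
 * 2. Diff against the cached stat node (resetting it if the counters ran
 *    backwards, i.e. the cgroup was recreated) and fold time spent on CPUs
 *    beyond max_cpus back into the visible CPUs as surplus.
 * 3. Emit an aggregate "cpu" line followed by renumbered per-CPU lines,
 *    e.g. "cpu0 123 0 45 678 0 0 0 0 0 0", then pass the remainder of the
 *    host file through unchanged.
 */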
int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int cpu_cnt = 0;
	uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
		 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
	uint64_t user_surplus = 0, system_surplus = 0;
	int nprocs, max_cpus;
	ssize_t l;
	uint64_t total_sum, threshold;
	struct cg_proc_stat *stat_node;

	nprocs = get_nprocs_conf();
	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines. */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores. */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* Not a ^cpuN line containing a number N. */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled. */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);
		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available in cpuset. */
	max_cpus = max_cpu_count(cg);
	if (max_cpus > cpu_cnt || !max_cpus)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
	if (!stat_node)
		return log_error(0, "Failed to find/create stat node for %s", cg);

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff)
		return 0;

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs. */
	if (max_cpus > 0) {
		uint64_t diff_user = 0;
		uint64_t diff_system = 0;
		uint64_t diff_idle = 0;
		uint64_t max_diff_idle = 0;
		uint64_t max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user. */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system. */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* Revise the cpu usage view to support the partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file. */
	/* cpu-all */
	l = snprintf(buf, buf_size,
		     "cpu  %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
		     user_sum, system_sum, idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs. */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass on the rest of /proc/stat, starting with the last line read. */
	l = snprintf(buf, buf_size, "%s", line);
	if (l < 0)
		return log_error(0, "Failed to write cache");
	if (l >= buf_size)
		return log_error(0, "Write to cache was truncated");

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass on the rest of the host's /proc/stat. */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if (l >= buf_size)
			return log_error(0, "Write to cache was truncated");

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);

	return total_len;
}

/*
 * Check whether this is a '^processor' line in /proc/cpuinfo.
 */
static inline bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}

static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}

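/*
 * FUSE read handler for the container's /proc/cpuinfo: copy the host file
 * but keep only the processor entries that fall inside the container's
 * cpuset (at most max_cpus of them when CFS-based sizing is enabled),
 * renumbering them from 0. s390x needs special handling because its
 * cpuinfo lists processors as "processor N:" lines below a common header.
 */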
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
		use_view = true;
	else
		use_view = false;
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;

		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}

		if (strncmp(line, "# processors", 12) == 0)
			continue;

		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0)
					return log_error(0, "Failed to write cache");
				if (l >= cache_size)
					return log_error(0, "Write to cache was truncated");
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;

			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;

			if (!cpu_in_cpuset(cpu, cpuset))
				continue;

			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;

			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0)
				return log_error(0, "Failed to write cache");
			if (l >= cache_size)
				return log_error(0, "Write to cache was truncated");

			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* Read from offset 0. */
	memcpy(buf, d->buf, total_len);

	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
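/*
 * For reference, cpuacct.usage_all is expected to look like this (times in
 * nanoseconds), with cpuacct.usage_percpu as the flat fallback format:
 *
 *	cpuacct.usage_all:
 *		cpu user system
 *		0 100000000 200000000
 *		1 300000000 400000000
 *
 *	cpuacct.usage_percpu:
 *		300000000 700000000
 */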
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int cpucount;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_debug("%m - Failed to determine number of ticks per second");
		return -1;
	}

	cpucount = get_nprocs_conf();
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *sep = " \t\n";
		char *tok;

		/* Read cpuacct.usage_percpu instead. */
		lxcfs_debug("Falling back to cpuacct.usage_percpu");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;

		lxc_iterate_parts(tok, usage_str, sep) {
			uint64_t percpu_user;

			if (i >= cpucount)
				break;

			tok = trim_whitespace_in_place(tok);
			ret = safe_uint64(tok, &percpu_user, 10);
			if (ret)
				return -1;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[i].system = cpu_usage[i].user;
			lxcfs_debug("cpu%d with time %s", i, tok);
			i++;
		}
	} else {
		if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
			return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);

		read_pos += read_cnt;

		for (i = 0, j = 0; i < cpucount; i++) {
			ret = sscanf(usage_str + read_pos,
				     "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
				     &cg_user, &cg_system, &read_cnt);
			if (ret == EOF)
				break;

			if (ret != 3)
				return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
						 usage_str + read_pos, cg);

			read_pos += read_cnt;

			/* Convert the time from nanoseconds to USER_HZ */
			cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
			cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
			j++;
		}
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head))
		return log_error(false, "%s", strerror(errno));

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		free_disarm(*head);
		return log_error(false, "Failed to initialize list lock");
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node;

	if (head->next) {
		node = head->next;

		for (;;) {
			struct cg_proc_stat *cur = node;

			node = node->next;
			free_proc_stat_node(cur);
			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}