]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_cpuview.c
Merge pull request #361 from 3XX0/unicg_fixups
[mirror_lxcfs.git] / src / proc_cpuview.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #ifndef FUSE_USE_VERSION
8 #define FUSE_USE_VERSION 26
9 #endif
10
11 #define _FILE_OFFSET_BITS 64
12
13 #define __STDC_FORMAT_MACROS
14 #include <dirent.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <fuse.h>
18 #include <inttypes.h>
19 #include <libgen.h>
20 #include <pthread.h>
21 #include <sched.h>
22 #include <stdarg.h>
23 #include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <wait.h>
31 #include <linux/magic.h>
32 #include <linux/sched.h>
33 #include <sys/epoll.h>
34 #include <sys/mman.h>
35 #include <sys/mount.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <sys/syscall.h>
39 #include <sys/sysinfo.h>
40 #include <sys/vfs.h>
41
42 #include "bindings.h"
43 #include "config.h"
44 #include "cgroup_fuse.h"
45 #include "cpuset_parse.h"
46 #include "cgroups/cgroup.h"
47 #include "cgroups/cgroup_utils.h"
48 #include "memory_utils.h"
49 #include "proc_loadavg.h"
50 #include "utils.h"
51
/*
 * Data for CPU view.
 *
 * One record per cgroup. Records that hash to the same bucket are chained
 * through `next`.
 */
struct cg_proc_stat {
	/* Cgroup path; records are looked up by comparing this string. */
	char *cg;

	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *usage;

	/* Usage stats reported to the container. */
	struct cpuacct_usage *view;

	/* Number of entries allocated in both `usage` and `view`. */
	int cpu_count;

	/* For node manipulation. */
	pthread_mutex_t lock;

	/* Next record in the same bucket, NULL at the end of the chain. */
	struct cg_proc_stat *next;
};
61
62 struct cg_proc_stat_head {
63 struct cg_proc_stat *next;
64 time_t lastcheck;
65
66 /*
67 * For access to the list. Reading can be parallel, pruning is exclusive.
68 */
69 pthread_rwlock_t lock;
70 };
71
/* Number of buckets in the CPU view hash table. */
#define CPUVIEW_HASH_SIZE 100

/* Bucket heads, indexed by calc_hash(cgroup) % CPUVIEW_HASH_SIZE. */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
74
75 static void reset_proc_stat_node(struct cg_proc_stat *node,
76 struct cpuacct_usage *usage, int cpu_count)
77 {
78 lxcfs_debug("Resetting stat node for %s\n", node->cg);
79 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
80
81 for (int i = 0; i < cpu_count; i++) {
82 node->view[i].user = 0;
83 node->view[i].system = 0;
84 node->view[i].idle = 0;
85 }
86
87 node->cpu_count = cpu_count;
88 }
89
90 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
91 {
92 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
93
94 /* Allocate new memory */
95 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
96 if (!new_usage)
97 return false;
98
99 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
100 if (!new_view)
101 return false;
102
103 /* Copy existing data & initialize new elements */
104 for (int i = 0; i < cpu_count; i++) {
105 if (i < node->cpu_count) {
106 new_usage[i].user = node->usage[i].user;
107 new_usage[i].system = node->usage[i].system;
108 new_usage[i].idle = node->usage[i].idle;
109
110 new_view[i].user = node->view[i].user;
111 new_view[i].system = node->view[i].system;
112 new_view[i].idle = node->view[i].idle;
113 } else {
114 new_usage[i].user = 0;
115 new_usage[i].system = 0;
116 new_usage[i].idle = 0;
117
118 new_view[i].user = 0;
119 new_view[i].system = 0;
120 new_view[i].idle = 0;
121 }
122 }
123
124 free(node->usage);
125 node->usage = move_ptr(new_usage);
126
127 free(node->view);
128 node->view = move_ptr(new_view);
129 node->cpu_count = cpu_count;
130
131 return true;
132 }
133
134 static void free_proc_stat_node(struct cg_proc_stat *node)
135 {
136 pthread_mutex_destroy(&node->lock);
137 free_disarm(node->cg);
138 free_disarm(node->usage);
139 free_disarm(node->view);
140 free_disarm(node);
141 }
142
143 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
144 {
145 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
146 struct cg_proc_stat_head *head = proc_stat_history[hash];
147 struct cg_proc_stat *node, *rv = new_node;
148
149 pthread_rwlock_wrlock(&head->lock);
150
151 if (!head->next) {
152 head->next = new_node;
153 goto out;
154 }
155
156 node = head->next;
157
158 for (;;) {
159 if (strcmp(node->cg, new_node->cg) == 0) {
160 /* The node is already present, return it */
161 free_proc_stat_node(new_node);
162 rv = node;
163 goto out;
164 }
165
166 if (node->next) {
167 node = node->next;
168 continue;
169 }
170
171 node->next = new_node;
172 goto out;
173 }
174
175 out:
176 pthread_rwlock_unlock(&head->lock);
177 return rv;
178 }
179
180 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
181 {
182 struct cg_proc_stat *node;
183 int i;
184
185 node = malloc(sizeof(struct cg_proc_stat));
186 if (!node)
187 goto err;
188
189 node->cg = NULL;
190 node->usage = NULL;
191 node->view = NULL;
192
193 node->cg = malloc(strlen(cg) + 1);
194 if (!node->cg)
195 goto err;
196
197 strcpy(node->cg, cg);
198
199 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
200 if (!node->usage)
201 goto err;
202
203 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
204
205 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
206 if (!node->view)
207 goto err;
208
209 node->cpu_count = cpu_count;
210 node->next = NULL;
211
212 if (pthread_mutex_init(&node->lock, NULL) != 0)
213 log_error(goto err, "Failed to initialize node lock");
214
215 for (i = 0; i < cpu_count; i++) {
216 node->view[i].user = 0;
217 node->view[i].system = 0;
218 node->view[i].idle = 0;
219 }
220
221 return node;
222
223 err:
224 if (node && node->cg)
225 free(node->cg);
226 if (node && node->usage)
227 free(node->usage);
228 if (node && node->view)
229 free(node->view);
230 if (node)
231 free(node);
232
233 return NULL;
234 }
235
236 static bool cgfs_param_exist(const char *controller, const char *cgroup,
237 const char *file)
238 {
239 __do_free char *path = NULL;
240 int cfd;
241
242 cfd = get_cgroup_fd(controller);
243 if (cfd < 0)
244 return false;
245
246 path = must_make_path(dot_or_empty(cgroup), cgroup, file, NULL);
247 return (faccessat(cfd, path, F_OK, 0) == 0);
248 }
249
250 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
251 {
252 struct cg_proc_stat *first = NULL;
253
254 for (struct cg_proc_stat *prev = NULL; node; ) {
255 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
256 struct cg_proc_stat *tmp = node;
257
258 lxcfs_debug("Removing stat node for %s\n", node->cg);
259
260 if (prev)
261 prev->next = node->next;
262 else
263 first = node->next;
264
265 node = node->next;
266 free_proc_stat_node(tmp);
267 } else {
268 if (!first)
269 first = node;
270 prev = node;
271 node = node->next;
272 }
273 }
274
275 return first;
276 }
277
278 #define PROC_STAT_PRUNE_INTERVAL 10
279 static void prune_proc_stat_history(void)
280 {
281 time_t now = time(NULL);
282
283 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
284 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
285
286 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
287 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
288 return;
289 }
290
291 if (proc_stat_history[i]->next) {
292 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
293 proc_stat_history[i]->lastcheck = now;
294 }
295
296 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
297 }
298 }
299
300 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
301 const char *cg)
302 {
303 struct cg_proc_stat *node;
304
305 pthread_rwlock_rdlock(&head->lock);
306
307 if (!head->next) {
308 pthread_rwlock_unlock(&head->lock);
309 return NULL;
310 }
311
312 node = head->next;
313
314 do {
315 if (strcmp(cg, node->cg) == 0)
316 goto out;
317 } while ((node = node->next));
318
319 node = NULL;
320
321 out:
322 pthread_rwlock_unlock(&head->lock);
323 prune_proc_stat_history();
324 return node;
325 }
326
327 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
328 {
329 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
330 struct cg_proc_stat_head *head = proc_stat_history[hash];
331 struct cg_proc_stat *node;
332
333 node = find_proc_stat_node(head, cg);
334 if (!node) {
335 node = new_proc_stat_node(usage, cpu_count, cg);
336 if (!node)
337 return NULL;
338
339 node = add_proc_stat_node(node);
340 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
341 }
342
343 pthread_mutex_lock(&node->lock);
344
345 /* If additional CPUs on the host have been enabled, CPU usage counter
346 * arrays have to be expanded */
347 if (node->cpu_count < cpu_count) {
348 lxcfs_debug("Expanding stat node %d->%d for %s\n",
349 node->cpu_count, cpu_count, cg);
350
351 if (!expand_proc_stat_node(node, cpu_count)) {
352 pthread_mutex_unlock(&node->lock);
353 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
354 }
355 }
356
357 return node;
358 }
359
360 static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
361 uint64_t *counter, uint64_t threshold)
362 {
363 unsigned long free_space, to_add;
364
365 free_space = threshold - usage->user - usage->system;
366
367 if (free_space > usage->idle)
368 free_space = usage->idle;
369
370 to_add = free_space > *surplus ? *surplus : free_space;
371
372 *counter += to_add;
373 usage->idle -= to_add;
374 *surplus -= to_add;
375 }
376
377 static unsigned long diff_cpu_usage(struct cpuacct_usage *older,
378 struct cpuacct_usage *newer,
379 struct cpuacct_usage *diff, int cpu_count)
380 {
381 unsigned long sum = 0;
382
383 for (int i = 0; i < cpu_count; i++) {
384 if (!newer[i].online)
385 continue;
386
387 /*
388 * When cpuset is changed on the fly, the CPUs might get
389 * reordered. We could either reset all counters, or check
390 * that the substractions below will return expected results.
391 */
392 if (newer[i].user > older[i].user)
393 diff[i].user = newer[i].user - older[i].user;
394 else
395 diff[i].user = 0;
396
397 if (newer[i].system > older[i].system)
398 diff[i].system = newer[i].system - older[i].system;
399 else
400 diff[i].system = 0;
401
402 if (newer[i].idle > older[i].idle)
403 diff[i].idle = newer[i].idle - older[i].idle;
404 else
405 diff[i].idle = 0;
406
407 sum += diff[i].user;
408 sum += diff[i].system;
409 sum += diff[i].idle;
410 }
411
412 return sum;
413 }
414
415 /*
416 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
417 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
418 * throuh `value`.
419 */
420 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
421 {
422 __do_free char *str = NULL;
423 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
424 bool first = true;
425
426 if (!pure_unified_layout(cgroup_ops)) {
427 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
428 } else {
429 strcpy(file, "cpu.max");
430 first = !strcmp(param, "quota");
431 }
432
433 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
434 return false;
435
436 if (sscanf(str, first ? "%"PRId64 : "%*"PRId64" %"PRId64, value) != 1)
437 return false;
438
439 return true;
440 }
441
/*
 * Return the exact (possibly fractional) number of visible CPUs based on
 * CPU quotas, capped at the number of processors reported by get_nprocs().
 *
 * Zero is returned if there is no quota set or if the quota parameters
 * cannot be read.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t cfs_quota = 0, cfs_period = 0;
	double rv;
	int nprocs;

	/*
	 * A failed read leaves its output untouched, so the result has to be
	 * checked; otherwise the division below would run on uninitialized
	 * values.
	 */
	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();
	if (rv > nprocs)
		rv = nprocs;

	return rv;
}
467
468 /*
469 * Return the maximum number of visible CPUs based on CPU quotas.
470 * If there is no quota set, zero is returned.
471 */
472 int max_cpu_count(const char *cg)
473 {
474 __do_free char *cpuset = NULL;
475 int rv, nprocs;
476 int64_t cfs_quota, cfs_period;
477 int nr_cpus_in_cpuset = 0;
478
479 read_cpu_cfs_param(cg, "quota", &cfs_quota);
480 read_cpu_cfs_param(cg, "period", &cfs_period);
481
482 cpuset = get_cpuset(cg);
483 if (cpuset)
484 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
485
486 if (cfs_quota <= 0 || cfs_period <= 0){
487 if (nr_cpus_in_cpuset > 0)
488 return nr_cpus_in_cpuset;
489
490 return 0;
491 }
492
493 rv = cfs_quota / cfs_period;
494
495 /* In case quota/period does not yield a whole number, add one CPU for
496 * the remainder.
497 */
498 if ((cfs_quota % cfs_period) > 0)
499 rv += 1;
500
501 nprocs = get_nprocs();
502 if (rv > nprocs)
503 rv = nprocs;
504
505 /* use min value in cpu quota and cpuset */
506 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
507 rv = nr_cpus_in_cpuset;
508
509 return rv;
510 }
511
512 int cpuview_proc_stat(const char *cg, const char *cpuset,
513 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
514 FILE *f, char *buf, size_t buf_size)
515 {
516 __do_free char *line = NULL;
517 __do_free struct cpuacct_usage *diff = NULL;
518 size_t linelen = 0, total_len = 0;
519 int curcpu = -1; /* cpu numbering starts at 0 */
520 int physcpu, i;
521 int cpu_cnt = 0;
522 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
523 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
524 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
525 uint64_t user_surplus = 0, system_surplus = 0;
526 int nprocs, max_cpus;
527 ssize_t l;
528 uint64_t total_sum, threshold;
529 struct cg_proc_stat *stat_node;
530
531 nprocs = get_nprocs_conf();
532 if (cg_cpu_usage_size < nprocs)
533 nprocs = cg_cpu_usage_size;
534
535 /* Read all CPU stats and stop when we've encountered other lines */
536 while (getline(&line, &linelen, f) != -1) {
537 int ret;
538 char cpu_char[10]; /* That's a lot of cores */
539 uint64_t all_used, cg_used;
540
541 if (strlen(line) == 0)
542 continue;
543
544 /* not a ^cpuN line containing a number N */
545 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
546 break;
547
548 if (sscanf(cpu_char, "%d", &physcpu) != 1)
549 continue;
550
551 if (physcpu >= cg_cpu_usage_size)
552 continue;
553
554 curcpu++;
555 cpu_cnt++;
556
557 if (!cpu_in_cpuset(physcpu, cpuset)) {
558 for (i = curcpu; i <= physcpu; i++)
559 cg_cpu_usage[i].online = false;
560 continue;
561 }
562
563 if (curcpu < physcpu) {
564 /* Some CPUs may be disabled */
565 for (i = curcpu; i < physcpu; i++)
566 cg_cpu_usage[i].online = false;
567
568 curcpu = physcpu;
569 }
570
571 cg_cpu_usage[curcpu].online = true;
572
573 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu",
574 &user,
575 &nice,
576 &system,
577 &idle,
578 &iowait,
579 &irq,
580 &softirq,
581 &steal,
582 &guest,
583 &guest_nice);
584 if (ret != 10)
585 continue;
586
587 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
588 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
589
590 if (all_used >= cg_used) {
591 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
592
593 } else {
594 lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
595 curcpu, cg, all_used, cg_used);
596 cg_cpu_usage[curcpu].idle = idle;
597 }
598 }
599
600 /* Cannot use more CPUs than is available in cpuset. */
601 max_cpus = max_cpu_count(cg);
602 if (max_cpus > cpu_cnt || !max_cpus)
603 max_cpus = cpu_cnt;
604
605 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
606 if (!stat_node)
607 return log_error(0, "Failed to find/create stat node for %s", cg);
608
609 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
610 if (!diff)
611 return 0;
612
613 /*
614 * If the new values are LOWER than values stored in memory, it means
615 * the cgroup has been reset/recreated and we should reset too.
616 */
617 for (curcpu = 0; curcpu < nprocs; curcpu++) {
618 if (!cg_cpu_usage[curcpu].online)
619 continue;
620
621 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
622 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
623
624 break;
625 }
626
627 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
628
629 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
630 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
631
632 if (!stat_node->usage[curcpu].online)
633 continue;
634
635 i++;
636
637 stat_node->usage[curcpu].user += diff[curcpu].user;
638 stat_node->usage[curcpu].system += diff[curcpu].system;
639 stat_node->usage[curcpu].idle += diff[curcpu].idle;
640
641 if (max_cpus > 0 && i >= max_cpus) {
642 user_surplus += diff[curcpu].user;
643 system_surplus += diff[curcpu].system;
644 }
645 }
646
647 /* Calculate usage counters of visible CPUs */
648 if (max_cpus > 0) {
649 uint64_t diff_user = 0;
650 uint64_t diff_system = 0;
651 uint64_t diff_idle = 0;
652 uint64_t max_diff_idle = 0;
653 uint64_t max_diff_idle_index = 0;
654 double exact_cpus;
655
656 /* threshold = maximum usage per cpu, including idle */
657 threshold = total_sum / cpu_cnt * max_cpus;
658
659 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
660 if (!stat_node->usage[curcpu].online)
661 continue;
662
663 i++;
664
665 if (i == max_cpus)
666 break;
667
668 if (diff[curcpu].user + diff[curcpu].system >= threshold)
669 continue;
670
671 /* Add user */
672 add_cpu_usage(&user_surplus, &diff[curcpu],
673 &diff[curcpu].user, threshold);
674
675 if (diff[curcpu].user + diff[curcpu].system >= threshold)
676 continue;
677
678 /* If there is still room, add system */
679 add_cpu_usage(&system_surplus, &diff[curcpu],
680 &diff[curcpu].system, threshold);
681 }
682
683 if (user_surplus > 0)
684 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
685 if (system_surplus > 0)
686 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
687
688 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
689 if (!stat_node->usage[curcpu].online)
690 continue;
691
692 i++;
693
694 if (i == max_cpus)
695 break;
696
697 stat_node->view[curcpu].user += diff[curcpu].user;
698 stat_node->view[curcpu].system += diff[curcpu].system;
699 stat_node->view[curcpu].idle += diff[curcpu].idle;
700
701 user_sum += stat_node->view[curcpu].user;
702 system_sum += stat_node->view[curcpu].system;
703 idle_sum += stat_node->view[curcpu].idle;
704
705 diff_user += diff[curcpu].user;
706 diff_system += diff[curcpu].system;
707 diff_idle += diff[curcpu].idle;
708 if (diff[curcpu].idle > max_diff_idle) {
709 max_diff_idle = diff[curcpu].idle;
710 max_diff_idle_index = curcpu;
711 }
712
713 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
714 }
715 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
716
717 /* revise cpu usage view to support partial cpu case. */
718 exact_cpus = exact_cpu_count(cg);
719 if (exact_cpus < (double)max_cpus){
720 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
721
722 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
723 lxcfs_v("delta: %lu\n", delta);
724 lxcfs_v("idle_sum before: %lu\n", idle_sum);
725 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
726 lxcfs_v("idle_sum after: %lu\n", idle_sum);
727
728 curcpu = max_diff_idle_index;
729 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
730 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
731 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
732 }
733 } else {
734 for (curcpu = 0; curcpu < nprocs; curcpu++) {
735 if (!stat_node->usage[curcpu].online)
736 continue;
737
738 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
739 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
740 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
741
742 user_sum += stat_node->view[curcpu].user;
743 system_sum += stat_node->view[curcpu].system;
744 idle_sum += stat_node->view[curcpu].idle;
745 }
746 }
747
748 /* Render the file */
749 /* cpu-all */
750 l = snprintf(buf, buf_size,
751 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
752 user_sum, system_sum, idle_sum);
753 lxcfs_v("cpu-all: %s\n", buf);
754 if (l < 0)
755 return log_error(0, "Failed to write cache");
756 if (l >= buf_size)
757 return log_error(0, "Write to cache was truncated");
758
759 buf += l;
760 buf_size -= l;
761 total_len += l;
762
763 /* Render visible CPUs */
764 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
765 if (!stat_node->usage[curcpu].online)
766 continue;
767
768 i++;
769
770 if (max_cpus > 0 && i == max_cpus)
771 break;
772
773 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
774 i,
775 stat_node->view[curcpu].user,
776 stat_node->view[curcpu].system,
777 stat_node->view[curcpu].idle);
778 lxcfs_v("cpu: %s\n", buf);
779 if (l < 0)
780 return log_error(0, "Failed to write cache");
781 if (l >= buf_size)
782 return log_error(0, "Write to cache was truncated");
783
784 buf += l;
785 buf_size -= l;
786 total_len += l;
787 }
788
789 /* Pass the rest of /proc/stat, start with the last line read */
790 l = snprintf(buf, buf_size, "%s", line);
791 if (l < 0)
792 return log_error(0, "Failed to write cache");
793 if (l >= buf_size)
794 return log_error(0, "Write to cache was truncated");
795
796 buf += l;
797 buf_size -= l;
798 total_len += l;
799
800 /* Pass the rest of the host's /proc/stat */
801 while (getline(&line, &linelen, f) != -1) {
802 l = snprintf(buf, buf_size, "%s", line);
803 if (l < 0)
804 return log_error(0, "Failed to write cache");
805 if (l >= buf_size)
806 return log_error(0, "Write to cache was truncated");
807
808 buf += l;
809 buf_size -= l;
810 total_len += l;
811 }
812
813 if (stat_node)
814 pthread_mutex_unlock(&stat_node->lock);
815
816 return total_len;
817 }
818
/*
 * Check whether this is a "^processor" line in /proc/cpuinfo, i.e. a line
 * of the form "processor : <number>".
 */
static inline bool is_processor_line(const char *line)
{
	int cpu_nr;
	int matched = sscanf(line, "processor : %d", &cpu_nr);

	return matched == 1;
}
827
/*
 * Check whether @line is a "processor : <number>" line whose CPU number is
 * part of @cpuset. Any other line yields false.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_nr;

	if (sscanf(line, "processor : %d", &cpu_nr) != 1)
		return false;

	return cpu_in_cpuset(cpu_nr, cpuset);
}
837
838 int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
839 struct fuse_file_info *fi)
840 {
841 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
842 __do_free void *fopen_cache = NULL;
843 __do_fclose FILE *f = NULL;
844 struct fuse_context *fc = fuse_get_context();
845 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
846 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
847 size_t linelen = 0, total_len = 0;
848 bool am_printing = false, firstline = true, is_s390x = false;
849 int curcpu = -1, cpu, max_cpus = 0;
850 bool use_view;
851 char *cache = d->buf;
852 size_t cache_size = d->buflen;
853
854 if (offset) {
855 int left;
856
857 if (offset > d->size)
858 return -EINVAL;
859
860 if (!d->cached)
861 return 0;
862
863 left = d->size - offset;
864 total_len = left > size ? size: left;
865 memcpy(buf, cache + offset, total_len);
866
867 return total_len;
868 }
869
870 pid_t initpid = lookup_initpid_in_store(fc->pid);
871 if (initpid <= 1 || is_shared_pidns(initpid))
872 initpid = fc->pid;
873
874 cg = get_pid_cgroup(initpid, "cpuset");
875 if (!cg)
876 return read_file_fuse("proc/cpuinfo", buf, size, d);
877 prune_init_slice(cg);
878
879 cpuset = get_cpuset(cg);
880 if (!cpuset)
881 return 0;
882
883 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
884 use_view = true;
885 else
886 use_view = false;
887 if (use_view)
888 max_cpus = max_cpu_count(cg);
889
890 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
891 if (!f)
892 return 0;
893
894 while (getline(&line, &linelen, f) != -1) {
895 ssize_t l;
896 if (firstline) {
897 firstline = false;
898 if (strstr(line, "IBM/S390") != NULL) {
899 is_s390x = true;
900 am_printing = true;
901 continue;
902 }
903 }
904
905 if (strncmp(line, "# processors:", 12) == 0)
906 continue;
907
908 if (is_processor_line(line)) {
909 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
910 break;
911
912 am_printing = cpuline_in_cpuset(line, cpuset);
913 if (am_printing) {
914 curcpu++;
915 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
916 if (l < 0)
917 return log_error(0, "Failed to write cache");
918 if (l >= cache_size)
919 return log_error(0, "Write to cache was truncated");
920 cache += l;
921 cache_size -= l;
922 total_len += l;
923 }
924 continue;
925 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
926 char *p;
927
928 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
929 break;
930
931 if (!cpu_in_cpuset(cpu, cpuset))
932 continue;
933
934 curcpu ++;
935 p = strchr(line, ':');
936 if (!p || !*p)
937 return 0;
938 p++;
939
940 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
941 if (l < 0)
942 return log_error(0, "Failed to write cache");
943 if (l >= cache_size)
944 return log_error(0, "Write to cache was truncated");
945
946 cache += l;
947 cache_size -= l;
948 total_len += l;
949 continue;
950
951 }
952 if (am_printing) {
953 l = snprintf(cache, cache_size, "%s", line);
954 if (l < 0)
955 return log_error(0, "Failed to write cache");
956 if (l >= cache_size)
957 return log_error(0, "Write to cache was truncated");
958
959 cache += l;
960 cache_size -= l;
961 total_len += l;
962 }
963 }
964
965 if (is_s390x) {
966 __do_free char *origcache = d->buf;
967 ssize_t l;
968
969 d->buf = malloc(d->buflen);
970 if (!d->buf) {
971 d->buf = move_ptr(origcache);
972 return 0;
973 }
974
975 cache = d->buf;
976 cache_size = d->buflen;
977 total_len = 0;
978 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
979 if (l < 0 || l >= cache_size)
980 return 0;
981
982 cache_size -= l;
983 cache += l;
984 total_len += l;
985 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
986 if (l < 0 || l >= cache_size)
987 return 0;
988
989 cache_size -= l;
990 cache += l;
991 total_len += l;
992 l = snprintf(cache, cache_size, "%s", origcache);
993 if (l < 0 || l >= cache_size)
994 return 0;
995 total_len += l;
996 }
997
998 d->cached = 1;
999 d->size = total_len;
1000 if (total_len > size)
1001 total_len = size;
1002
1003 /* read from off 0 */
1004 memcpy(buf, d->buf, total_len);
1005
1006 return total_len;
1007 }
1008
1009 /*
1010 * Returns 0 on success.
1011 * It is the caller's responsibility to free `return_usage`, unless this
1012 * function returns an error.
1013 */
1014 int read_cpuacct_usage_all(char *cg, char *cpuset,
1015 struct cpuacct_usage **return_usage, int *size)
1016 {
1017 __do_free char *usage_str = NULL;
1018 __do_free struct cpuacct_usage *cpu_usage = NULL;
1019 int cpucount;
1020 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
1021 int ret;
1022 int cg_cpu;
1023 uint64_t cg_user, cg_system;
1024 int64_t ticks_per_sec;
1025
1026 ticks_per_sec = sysconf(_SC_CLK_TCK);
1027 if (ticks_per_sec < 0 && errno == EINVAL) {
1028 lxcfs_v(
1029 "%s\n",
1030 "read_cpuacct_usage_all failed to determine number of clock ticks "
1031 "in a second");
1032 return -1;
1033 }
1034
1035 cpucount = get_nprocs_conf();
1036 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1037 if (!cpu_usage)
1038 return -ENOMEM;
1039
1040 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1041 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
1042 char *data = NULL;
1043 size_t sz = 0, asz = 0;
1044
1045 /* read cpuacct.usage_percpu instead. */
1046 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
1047 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1048 return -1;
1049 lxcfs_v("usage_str: %s\n", usage_str);
1050
1051 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
1052 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
1053
1054 must_strcat(&data, &sz, &asz, "cpu user system\n");
1055
1056 while (sscanf(usage_str + read_pos, "%" PRIu64 " %n", &cg_user, &read_cnt) > 0) {
1057 lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
1058 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
1059 i++;
1060 read_pos += read_cnt;
1061 }
1062
1063 usage_str = data;
1064
1065 lxcfs_v("usage_str: %s\n", usage_str);
1066 }
1067
1068 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1069 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1070
1071 read_pos += read_cnt;
1072
1073 for (i = 0, j = 0; i < cpucount; i++) {
1074 ret = sscanf(usage_str + read_pos,
1075 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1076 &cg_user, &cg_system, &read_cnt);
1077
1078 if (ret == EOF)
1079 break;
1080
1081 if (ret != 3)
1082 return log_error(-1, "read_cpuacct_usage_all reading from %s/cpuacct.usage_all failed", cg);
1083
1084 read_pos += read_cnt;
1085
1086 /* Convert the time from nanoseconds to USER_HZ */
1087 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1088 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1089 j++;
1090 }
1091
1092 *return_usage = move_ptr(cpu_usage);
1093 *size = cpucount;
1094 return 0;
1095 }
1096
1097 static bool cpuview_init_head(struct cg_proc_stat_head **head)
1098 {
1099 *head = malloc(sizeof(struct cg_proc_stat_head));
1100 if (!(*head))
1101 return log_error(false, "%s", strerror(errno));
1102
1103 (*head)->lastcheck = time(NULL);
1104 (*head)->next = NULL;
1105
1106 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
1107 free_disarm(*head);
1108 return log_error(false, "Failed to initialize list lock");
1109 }
1110
1111 return true;
1112 }
1113
1114 bool init_cpuview(void)
1115 {
1116 int i;
1117
1118 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1119 proc_stat_history[i] = NULL;
1120
1121 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1122 if (!cpuview_init_head(&proc_stat_history[i]))
1123 goto err;
1124 }
1125
1126 return true;
1127
1128 err:
1129 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1130 if (proc_stat_history[i])
1131 free_disarm(proc_stat_history[i]);
1132 }
1133
1134 return false;
1135 }
1136
1137 static void cpuview_free_head(struct cg_proc_stat_head *head)
1138 {
1139 struct cg_proc_stat *node, *tmp;
1140
1141 if (head->next) {
1142 node = head->next;
1143
1144 for (;;) {
1145 tmp = node;
1146 node = node->next;
1147 free_proc_stat_node(tmp);
1148
1149 if (!node)
1150 break;
1151 }
1152 }
1153
1154 pthread_rwlock_destroy(&head->lock);
1155 free_disarm(head);
1156 }
1157
1158 void free_cpuview(void)
1159 {
1160 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1161 if (proc_stat_history[i])
1162 cpuview_free_head(proc_stat_history[i]);
1163 }