1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <dirent.h>
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <inttypes.h>
9 #include <libgen.h>
10 #include <pthread.h>
11 #include <sched.h>
12 #include <stdarg.h>
13 #include <stdbool.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <time.h>
19 #include <unistd.h>
20 #include <wait.h>
21 #include <linux/magic.h>
22 #include <linux/sched.h>
23 #include <sys/epoll.h>
24 #include <sys/mman.h>
25 #include <sys/mount.h>
26 #include <sys/param.h>
27 #include <sys/socket.h>
28 #include <sys/syscall.h>
29 #include <sys/sysinfo.h>
30 #include <sys/vfs.h>
31
32 #include "proc_cpuview.h"
33
34 #include "bindings.h"
35 #include "cgroup_fuse.h"
36 #include "cpuset_parse.h"
37 #include "cgroups/cgroup.h"
38 #include "cgroups/cgroup_utils.h"
39 #include "memory_utils.h"
40 #include "proc_loadavg.h"
41 #include "utils.h"
42
43 /* Data for CPU view */
44 struct cg_proc_stat {
45 char *cg;
46 struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat. */
47 struct cpuacct_usage *view; /* Usage stats reported to the container. */
48 int cpu_count;
49 pthread_mutex_t lock; /* For node manipulation. */
50 struct cg_proc_stat *next;
51 };
52
53 struct cg_proc_stat_head {
54 struct cg_proc_stat *next;
55 time_t lastcheck;
56
57 /*
58 * For access to the list. Reading can be parallel, pruning is exclusive.
59 */
60 pthread_rwlock_t lock;
61 };
62
63 #define CPUVIEW_HASH_SIZE 100
64 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
65
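/*
 * Overwrite a node's host usage counters with the freshly read values and
 * zero the per-CPU view counters so accounting starts over.
 */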
66 static void reset_proc_stat_node(struct cg_proc_stat *node,
67 struct cpuacct_usage *usage, int cpu_count)
68 {
69 lxcfs_debug("Resetting stat node for %s\n", node->cg);
70 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
71
72 for (int i = 0; i < cpu_count; i++) {
73 node->view[i].user = 0;
74 node->view[i].system = 0;
75 node->view[i].idle = 0;
76 }
77
78 node->cpu_count = cpu_count;
79 }
80
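/*
 * Grow a node's usage/view arrays to cpu_count entries, preserving the
 * counters that already exist for the previously known CPUs.
 */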
81 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
82 {
83 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
84
85 /* Allocate new memory */
86 new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
87 if (!new_usage)
88 return false;
89
90 new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
91 if (!new_view)
92 return false;
93
94 /* Copy existing data & initialize new elements */
95 for (int i = 0; i < cpu_count; i++) {
96 if (i < node->cpu_count) {
97 new_usage[i].user = node->usage[i].user;
98 new_usage[i].system = node->usage[i].system;
99 new_usage[i].idle = node->usage[i].idle;
100
101 new_view[i].user = node->view[i].user;
102 new_view[i].system = node->view[i].system;
103 new_view[i].idle = node->view[i].idle;
104 }
105 }
106
107 free(node->usage);
108 node->usage = move_ptr(new_usage);
109
110 free(node->view);
111 node->view = move_ptr(new_view);
112 node->cpu_count = cpu_count;
113
114 return true;
115 }
116
117 static void free_proc_stat_node(struct cg_proc_stat *node)
118 {
119 if (node) {
120 /*
121 * We're abusing the usage pointer to indicate that
122 * pthread_mutex_init() was successful. Don't judge me.
123 */
124 if (node->usage)
125 pthread_mutex_destroy(&node->lock);
126 free_disarm(node->cg);
127 free_disarm(node->usage);
128 free_disarm(node->view);
129 free_disarm(node);
130 }
131 }
132
133 define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node);
134
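/*
 * Insert new_node into its hash bucket. If a node for the same cgroup is
 * already present, the new node is freed and the existing one is returned.
 */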
135 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
136 {
137 call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node;
138 struct cg_proc_stat *rv = new_node;
139 int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE;
140 struct cg_proc_stat_head *head = proc_stat_history[hash];
141 struct cg_proc_stat *cur;
142
143 pthread_rwlock_wrlock(&head->lock);
144
145 if (!head->next) {
146 head->next = move_ptr(new);
147 goto out_rwlock_unlock;
148 }
149
150 cur = head->next;
151
152 for (;;) {
153 /*
154 * The node to be added is already present in the list, so
155 * free the newly allocated one and return the one we found.
156 */
157 if (strcmp(cur->cg, new->cg) == 0) {
158 rv = cur;
159 goto out_rwlock_unlock;
160 }
161
162 /* Keep walking. */
163 if (cur->next) {
164 cur = cur->next;
165 continue;
166 }
167
168 /* Add new node to end of list. */
169 cur->next = move_ptr(new);
170 goto out_rwlock_unlock;
171 }
172
173 out_rwlock_unlock:
174 pthread_rwlock_unlock(&head->lock);
175 return move_ptr(rv);
176 }
177
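/*
 * Allocate and initialize a stat node for cgroup @cg with a copy of the
 * current per-CPU usage and a zeroed view.
 */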
178 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage,
179 int cpu_count, const char *cg)
180 {
181 call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL;
182 __do_free struct cpuacct_usage *new_usage = NULL;
183
184 node = zalloc(sizeof(struct cg_proc_stat));
185 if (!node)
186 return NULL;
187
188 node->cg = strdup(cg);
189 if (!node->cg)
190 return NULL;
191
192 new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count);
193 if (!new_usage)
194 return NULL;
195
196 node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count);
197 if (!node->view)
198 return NULL;
199
200 node->cpu_count = cpu_count;
201
202 if (pthread_mutex_init(&node->lock, NULL))
203 return NULL;
204 /*
205 * We're abusing the usage pointer to indicate that
206 * pthread_mutex_init() was successful. Don't judge me.
207 */
208 node->usage = move_ptr(new_usage);
209
210 return move_ptr(node);
211 }
212
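/* Check whether @file is accessible for @cgroup under the given @controller. */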
213 static bool cgroup_supports(const char *controller, const char *cgroup,
214 const char *file)
215 {
216 __do_free char *path = NULL;
217 int cfd;
218
219 cfd = get_cgroup_fd(controller);
220 if (cfd < 0)
221 return false;
222
223 path = must_make_path_relative(cgroup, file, NULL);
224 return faccessat(cfd, path, F_OK, 0) == 0;
225 }
226
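/*
 * Walk one bucket's list and drop nodes whose cgroup no longer exposes an
 * accessible cpu.shares file. Returns the new list head.
 */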
227 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
228 {
229 struct cg_proc_stat *first = NULL;
230
231 for (struct cg_proc_stat *prev = NULL; node; ) {
232 if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
233 struct cg_proc_stat *cur = node;
234
235 if (prev)
236 prev->next = node->next;
237 else
238 first = node->next;
239
240 node = node->next;
241 lxcfs_debug("Removing stat node for %s\n", cur->cg);
242
243 free_proc_stat_node(cur);
244 } else {
245 if (!first)
246 first = node;
247 prev = node;
248 node = node->next;
249 }
250 }
251
252 return first;
253 }
254
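/*
 * Walk all hash buckets and prune stale nodes; bail out early if a bucket
 * was checked less than PROC_STAT_PRUNE_INTERVAL seconds ago.
 */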
255 #define PROC_STAT_PRUNE_INTERVAL 10
256 static void prune_proc_stat_history(void)
257 {
258 time_t now = time(NULL);
259
260 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) {
261 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
262
263 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
264 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
265 return;
266 }
267
268 if (proc_stat_history[i]->next) {
269 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
270 proc_stat_history[i]->lastcheck = now;
271 }
272
273 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
274 }
275 }
276
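/* Look up the stat node for @cg in @head's bucket; returns NULL if absent. */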
277 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
278 const char *cg)
279 {
280 struct cg_proc_stat *node;
281
282 pthread_rwlock_rdlock(&head->lock);
283
284 if (!head->next) {
285 pthread_rwlock_unlock(&head->lock);
286 return NULL;
287 }
288
289 node = head->next;
290
291 do {
292 if (strcmp(cg, node->cg) == 0)
293 goto out;
294 } while ((node = node->next));
295
296 node = NULL;
297
298 out:
299 pthread_rwlock_unlock(&head->lock);
300 prune_proc_stat_history();
301 return node;
302 }
303
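/*
 * Find the stat node for @cg, creating it if necessary, and expand its
 * arrays if the host gained CPUs. On success, returns with node->lock held.
 */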
304 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
305 int cpu_count, const char *cg)
306 {
307 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
308 struct cg_proc_stat_head *head = proc_stat_history[hash];
309 struct cg_proc_stat *node;
310
311 node = find_proc_stat_node(head, cg);
312 if (!node) {
313 node = new_proc_stat_node(usage, cpu_count, cg);
314 if (!node)
315 return NULL;
316
317 node = add_proc_stat_node(node);
318 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
319 }
320
321 pthread_mutex_lock(&node->lock);
322
323 /*
324 * If additional CPUs on the host have been enabled, CPU usage counter
325 * arrays have to be expanded.
326 */
327 if (node->cpu_count < cpu_count) {
328 lxcfs_debug("Expanding stat node %d->%d for %s\n",
329 node->cpu_count, cpu_count, cg);
330
331 if (!expand_proc_stat_node(node, cpu_count)) {
332 pthread_mutex_unlock(&node->lock);
333 return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg);
334 }
335 }
336
337 return node;
338 }
339
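/*
 * Move up to *surplus ticks into *counter, limited by the CPU's free
 * capacity (threshold minus user+system, capped by its idle time).
 */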
340 static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage,
341 uint64_t *counter, uint64_t threshold)
342 {
343 uint64_t free_space, to_add;
344
345 free_space = threshold - usage->user - usage->system;
346
347 if (free_space > usage->idle)
348 free_space = usage->idle;
349
350 if (free_space > *surplus)
351 to_add = *surplus;
352 else
353 to_add = free_space;
354
355 *counter += to_add;
356 usage->idle -= to_add;
357 *surplus -= to_add;
358 }
359
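/*
 * Compute per-CPU deltas between two snapshots for all online CPUs into
 * @diff and return the sum of all deltas. Counters that went backwards
 * (e.g. after a cpuset change) are clamped to zero.
 */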
360 static uint64_t diff_cpu_usage(struct cpuacct_usage *older,
361 struct cpuacct_usage *newer,
362 struct cpuacct_usage *diff, int cpu_count)
363 {
364 uint64_t sum = 0;
365
366 for (int i = 0; i < cpu_count; i++) {
367 if (!newer[i].online)
368 continue;
369
370 /*
371 * When cpuset is changed on the fly, the CPUs might get
372 * reordered. We could either reset all counters, or check
373 * that the subtractions below will return expected results.
374 */
375 if (newer[i].user > older[i].user)
376 diff[i].user = newer[i].user - older[i].user;
377 else
378 diff[i].user = 0;
379
380 if (newer[i].system > older[i].system)
381 diff[i].system = newer[i].system - older[i].system;
382 else
383 diff[i].system = 0;
384
385 if (newer[i].idle > older[i].idle)
386 diff[i].idle = newer[i].idle - older[i].idle;
387 else
388 diff[i].idle = 0;
389
390 sum += diff[i].user;
391 sum += diff[i].system;
392 sum += diff[i].idle;
393 }
394
395 return sum;
396 }
397
398 /*
399 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
400 * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
401 * through `value`.
402 */
403 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
404 {
405 __do_free char *str = NULL;
406 char file[STRLITERALLEN("cpu.cfs_period_us") + 1];
407 bool first = true;
408 int ret;
409
410 if (pure_unified_layout(cgroup_ops)) {
411 first = !strcmp(param, "quota");
412 ret = snprintf(file, sizeof(file), "cpu.max");
413 } else {
414 ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
415 }
416 if (ret < 0 || (size_t)ret >= sizeof(file))
417 return false;
418
419 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
420 return false;
421
422 return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1;
423 }
424
425 /*
426 * Return the exact number of visible CPUs based on CPU quotas.
427 * If there is no quota set, zero is returned.
428 */
429 static double exact_cpu_count(const char *cg)
430 {
431 double rv;
432 int nprocs;
433 int64_t cfs_quota, cfs_period;
434
435 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
436 return 0;
437
438 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
439 return 0;
440
441 if (cfs_quota <= 0 || cfs_period <= 0)
442 return 0;
443
444 rv = (double)cfs_quota / (double)cfs_period;
445
446 nprocs = get_nprocs();
447
448 if (rv > nprocs)
449 rv = nprocs;
450
451 return rv;
452 }
453
454 /*
455 * Return true if the cgroup's CFS quota is negative or not set.
456 */
457 static bool cfs_quota_disabled(const char *cg)
458 {
459 int64_t cfs_quota;
460
461 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
462 return true;
463
464 return cfs_quota < 0;
465 }
466
467 /*
468 * Return the maximum number of visible CPUs based on CPU quotas.
469 * If there is no quota set, the number of CPUs in the cgroup's cpuset is returned.
470 */
471 int max_cpu_count(const char *cg)
472 {
473 __do_free char *cpuset = NULL;
474 int rv, nprocs;
475 int64_t cfs_quota, cfs_period;
476 int nr_cpus_in_cpuset = 0;
477
478 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
479 cfs_quota = 0;
480
481 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
482 cfs_period = 0;
483
484 cpuset = get_cpuset(cg);
485 if (cpuset)
486 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
487
488 if (cfs_quota <= 0 || cfs_period <= 0) {
489 if (nr_cpus_in_cpuset > 0)
490 return nr_cpus_in_cpuset;
491
492 return 0;
493 }
494
495 rv = cfs_quota / cfs_period;
496
497 /*
498 * In case quota/period does not yield a whole number, add one CPU for
499 * the remainder.
500 */
501 if ((cfs_quota % cfs_period) > 0)
502 rv += 1;
503
504 nprocs = get_nprocs();
505 if (rv > nprocs)
506 rv = nprocs;
507
508 /* Use min value in cpu quota and cpuset. */
509 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
510 rv = nr_cpus_in_cpuset;
511
512 return rv;
513 }
514
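/*
 * Render the "cpu" lines of the container's /proc/stat view into @buf,
 * scaling usage to the CPUs visible under the cgroup's quota and cpuset,
 * and pass the remaining host lines through unchanged. Returns the number
 * of bytes written, or 0 on error.
 */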
515 int cpuview_proc_stat(const char *cg, const char *cpuset,
516 struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
517 FILE *f, char *buf, size_t buf_size)
518 {
519 __do_free char *line = NULL;
520 __do_free struct cpuacct_usage *diff = NULL;
521 size_t linelen = 0, total_len = 0;
522 int curcpu = -1; /* cpu numbering starts at 0 */
523 int physcpu, i;
524 int cpu_cnt = 0;
525 uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0,
526 softirq = 0, steal = 0, guest = 0, guest_nice = 0;
527 uint64_t user_sum = 0, system_sum = 0, idle_sum = 0;
528 uint64_t user_surplus = 0, system_surplus = 0;
529 int nprocs, max_cpus;
530 ssize_t l;
531 uint64_t total_sum, threshold;
532 struct cg_proc_stat *stat_node;
533
534 nprocs = get_nprocs_conf();
535 if (cg_cpu_usage_size < nprocs)
536 nprocs = cg_cpu_usage_size;
537
538 /* Read all CPU stats and stop when we've encountered other lines */
539 while (getline(&line, &linelen, f) != -1) {
540 int ret;
541 char cpu_char[10]; /* That's a lot of cores */
542 uint64_t all_used, cg_used;
543
544 if (strlen(line) == 0)
545 continue;
546
547 /* not a ^cpuN line containing a number N */
548 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
549 break;
550
551 if (sscanf(cpu_char, "%d", &physcpu) != 1)
552 continue;
553
554 if (physcpu >= cg_cpu_usage_size)
555 continue;
556
557 curcpu++;
558 cpu_cnt++;
559
560 if (!cpu_in_cpuset(physcpu, cpuset)) {
561 for (i = curcpu; i <= physcpu; i++)
562 cg_cpu_usage[i].online = false;
563 continue;
564 }
565
566 if (curcpu < physcpu) {
567 /* Some CPUs may be disabled */
568 for (i = curcpu; i < physcpu; i++)
569 cg_cpu_usage[i].online = false;
570
571 curcpu = physcpu;
572 }
573
574 cg_cpu_usage[curcpu].online = true;
575
576 ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
577 &user,
578 &nice,
579 &system,
580 &idle,
581 &iowait,
582 &irq,
583 &softirq,
584 &steal,
585 &guest,
586 &guest_nice);
587 if (ret != 10)
588 continue;
589
590 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
591 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
592
593 if (all_used >= cg_used) {
594 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
595 } else {
596 lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
597 curcpu, cg, all_used, cg_used);
598 cg_cpu_usage[curcpu].idle = idle;
599 }
600 }
601
602 /* Cannot use more CPUs than are available in the cpuset. */
603 max_cpus = max_cpu_count(cg);
604 if (max_cpus > cpu_cnt || !max_cpus)
605 max_cpus = cpu_cnt;
606
607 /* On success, returns with node->lock held (pthread_mutex_lock()). */
608 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
609 if (!stat_node)
610 return log_error(0, "Failed to find/create stat node for %s", cg);
611
612 diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
613 if (!diff)
614 goto out_pthread_mutex_unlock;
615
616 /*
617 * If the new values are LOWER than values stored in memory, it means
618 * the cgroup has been reset/recreated and we should reset too.
619 */
620 for (curcpu = 0; curcpu < nprocs; curcpu++) {
621 if (!cg_cpu_usage[curcpu].online)
622 continue;
623
624 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
625 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
626
627 break;
628 }
629
630 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
631
632 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
633 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
634
635 if (!stat_node->usage[curcpu].online)
636 continue;
637
638 i++;
639
640 stat_node->usage[curcpu].user += diff[curcpu].user;
641 stat_node->usage[curcpu].system += diff[curcpu].system;
642 stat_node->usage[curcpu].idle += diff[curcpu].idle;
643
644 if (max_cpus > 0 && i >= max_cpus) {
645 user_surplus += diff[curcpu].user;
646 system_surplus += diff[curcpu].system;
647 }
648 }
649
650 /* Calculate usage counters of visible CPUs */
651 if (max_cpus > 0) {
652 uint64_t diff_user = 0;
653 uint64_t diff_system = 0;
654 uint64_t diff_idle = 0;
655 uint64_t max_diff_idle = 0;
656 uint64_t max_diff_idle_index = 0;
657 double exact_cpus;
658 /* threshold = maximum usage per cpu, including idle */
659 threshold = total_sum / cpu_cnt * max_cpus;
660
661 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
662 if (!stat_node->usage[curcpu].online)
663 continue;
664
665 i++;
666
667 if (i == max_cpus)
668 break;
669
670 if (diff[curcpu].user + diff[curcpu].system >= threshold)
671 continue;
672
673 /* Add user */
674 add_cpu_usage(&user_surplus, &diff[curcpu],
675 &diff[curcpu].user, threshold);
676
677 if (diff[curcpu].user + diff[curcpu].system >= threshold)
678 continue;
679
680 /* If there is still room, add system */
681 add_cpu_usage(&system_surplus, &diff[curcpu],
682 &diff[curcpu].system, threshold);
683 }
684
685 if (user_surplus > 0)
686 lxcfs_debug("leftover user: %" PRIu64 "for %s\n", user_surplus, cg);
687 if (system_surplus > 0)
688 lxcfs_debug("leftover system: %" PRIu64 "for %s\n", system_surplus, cg);
689
690 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
691 if (!stat_node->usage[curcpu].online)
692 continue;
693
694 i++;
695
696 if (i == max_cpus)
697 break;
698
699 stat_node->view[curcpu].user += diff[curcpu].user;
700 stat_node->view[curcpu].system += diff[curcpu].system;
701 stat_node->view[curcpu].idle += diff[curcpu].idle;
702
703 diff_user += diff[curcpu].user;
704 diff_system += diff[curcpu].system;
705 diff_idle += diff[curcpu].idle;
706 if (diff[curcpu].idle > max_diff_idle) {
707 max_diff_idle = diff[curcpu].idle;
708 max_diff_idle_index = curcpu;
709 }
710
711 lxcfs_v("curcpu: %d, diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
712 }
713 lxcfs_v("total. diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", diff_user, diff_system, diff_idle);
714
715 for (curcpu = 0; curcpu < nprocs; curcpu++) {
716 user_sum += stat_node->view[curcpu].user;
717 system_sum += stat_node->view[curcpu].system;
718 idle_sum += stat_node->view[curcpu].idle;
719 }
720
721 /* revise cpu usage view to support partial cpu case. */
722 exact_cpus = exact_cpu_count(cg);
723
724 /* Skip the revision when the CFS quota is disabled (exact_cpus == 0). */
725 if (!cfs_quota_disabled(cg) && exact_cpus < (double)max_cpus){
726 uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
727
728 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
729 lxcfs_v("delta: %" PRIu64 "\n", delta);
730 lxcfs_v("idle_sum before: %" PRIu64 "\n", idle_sum);
731 if (idle_sum > delta)
732 idle_sum = idle_sum - delta;
733 else
734 idle_sum = 0;
735 lxcfs_v("idle_sum after: %" PRIu64 "\n", idle_sum);
736
737 curcpu = max_diff_idle_index;
738 lxcfs_v("curcpu: %d, idle before: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
739 if (stat_node->view[curcpu].idle > delta)
740 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
741 else
742 stat_node->view[curcpu].idle = 0;
743 lxcfs_v("curcpu: %d, idle after: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
744 }
745 } else {
746 for (curcpu = 0; curcpu < nprocs; curcpu++) {
747 if (!stat_node->usage[curcpu].online)
748 continue;
749
750 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
751 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
752 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
753
754 user_sum += stat_node->view[curcpu].user;
755 system_sum += stat_node->view[curcpu].system;
756 idle_sum += stat_node->view[curcpu].idle;
757 }
758 }
759
760 /* Render the file */
761 /* cpu-all */
762 l = snprintf(buf, buf_size,
763 "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
764 user_sum, system_sum, idle_sum);
765 lxcfs_v("cpu-all: %s\n", buf);
766 if (l < 0) {
767 lxcfs_error("Failed to write cache");
768 total_len = 0;
769 goto out_pthread_mutex_unlock;
770 }
771 if ((size_t)l >= buf_size) {
772 lxcfs_error("Write to cache was truncated");
773 total_len = 0;
774 goto out_pthread_mutex_unlock;
775 }
776
777 buf += l;
778 buf_size -= l;
779 total_len += l;
780
781 /* Render visible CPUs.
782  * Assume there are K CPUs: 0, 1, 2, ..., K-1.
783  * Among them, there are M online CPUs with indices a1, a2, ..., aN, ..., aM (M >= N),
784  * where N = max_cpus and M = the number of online CPUs.
785  *
786  * There will be N rendered CPUs, indexed from 0 to N-1; their CPU times are calculated with the following formulas:
787  * - user_time[0] = stat_node->view[0].user + stat_node->view[1].user + ... + stat_node->view[a1].user
788  * - user_time[1] = stat_node->view[a1+1].user + stat_node->view[a1+2].user + ... + stat_node->view[a2].user
789  * ...
790  * - user_time[N-2] = stat_node->view[a(N-2)+1].user + stat_node->view[a(N-2)+2].user + ...
791  *                  + stat_node->view[a(N-1)].user
792  * - user_time[N-1] = stat_node->view[a(N-1)+1].user + stat_node->view[a(N-1)+2].user + ...
793  *                  + stat_node->view[aN].user + ... + stat_node->view[K-1].user (sum of all remaining CPUs)
794  *
795  * Similar formulas apply to system and idle time.
796  */
797
798 uint64_t curcpu_view_user_sum = 0, curcpu_view_system_sum = 0, curcpu_view_idle_sum = 0;
799 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
800 curcpu_view_user_sum += stat_node->view[curcpu].user;
801 curcpu_view_system_sum += stat_node->view[curcpu].system;
802 curcpu_view_idle_sum += stat_node->view[curcpu].idle;
803
804 if (!stat_node->usage[curcpu].online && curcpu < nprocs - 1) {
805 continue;
806 }
807
808 i++;
809
810 if (max_cpus > 0 && i >= max_cpus) {
811 // max(i) = count(rendered cpus) = max_cpus - 1
812 i--;
813 }
814
815 if (max_cpus > 0 && i == max_cpus - 1 && curcpu < nprocs - 1) {
816 // last 'rendered' cpu, sum until reaches the last cpu
817 continue;
818 }
819
820 l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
821 i,
822 curcpu_view_user_sum,
823 curcpu_view_system_sum,
824 curcpu_view_idle_sum);
825 lxcfs_v("cpu: %s\n", buf);
826 if (l < 0) {
827 lxcfs_error("Failed to write cache");
828 total_len = 0;
829 goto out_pthread_mutex_unlock;
830 }
831 if ((size_t)l >= buf_size) {
832 lxcfs_error("Write to cache was truncated");
833 total_len = 0;
834 goto out_pthread_mutex_unlock;
835 }
836
837 buf += l;
838 buf_size -= l;
839 total_len += l;
840
841 curcpu_view_user_sum = 0;
842 curcpu_view_system_sum = 0;
843 curcpu_view_idle_sum = 0;
844 }
845
846 /* Pass the rest of /proc/stat, start with the last line read */
847 l = snprintf(buf, buf_size, "%s", line);
848 if (l < 0) {
849 lxcfs_error("Failed to write cache");
850 total_len = 0;
851 goto out_pthread_mutex_unlock;
852 }
853 if ((size_t)l >= buf_size) {
854 lxcfs_error("Write to cache was truncated");
855 total_len = 0;
856 goto out_pthread_mutex_unlock;
857 }
858
859 buf += l;
860 buf_size -= l;
861 total_len += l;
862
863 /* Pass the rest of the host's /proc/stat */
864 while (getline(&line, &linelen, f) != -1) {
865 l = snprintf(buf, buf_size, "%s", line);
866 if (l < 0) {
867 lxcfs_error("Failed to write cache");
868 total_len = 0;
869 goto out_pthread_mutex_unlock;
870 }
871 if ((size_t)l >= buf_size) {
872 lxcfs_error("Write to cache was truncated");
873 total_len = 0;
874 goto out_pthread_mutex_unlock;
875 }
876
877 buf += l;
878 buf_size -= l;
879 total_len += l;
880 }
881
882 out_pthread_mutex_unlock:
883 if (stat_node)
884 pthread_mutex_unlock(&stat_node->lock);
885
886 return total_len;
887 }
888
889 /*
890 * Check whether this is a '^processor' line in /proc/cpuinfo.
891 */
892 static inline bool is_processor_line(const char *line)
893 {
894 int cpu;
895 return sscanf(line, "processor : %d", &cpu) == 1;
896 }
897
898 static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
899 {
900 int cpu;
901
902 if (sscanf(line, "processor : %d", &cpu) == 1)
903 return cpu_in_cpuset(cpu, cpuset);
904
905 return false;
906 }
907
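/*
 * FUSE read handler for /proc/cpuinfo: emit only the processor entries
 * that fall inside the container's cpuset (and, when the CFS-based CPU
 * view is enabled, its CPU quota), renumbering them from 0. The s390x
 * cpuinfo layout is handled separately.
 */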
908 int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
909 struct fuse_file_info *fi)
910 {
911 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
912 __do_free void *fopen_cache = NULL;
913 __do_fclose FILE *f = NULL;
914 struct fuse_context *fc = fuse_get_context();
915 struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
916 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
917 size_t linelen = 0, total_len = 0;
918 bool am_printing = false, firstline = true, is_s390x = false;
919 int curcpu = -1, cpu, max_cpus = 0;
920 bool use_view;
921 char *cache = d->buf;
922 size_t cache_size = d->buflen;
923
924 if (offset) {
925 size_t left;
926
927 if (offset > d->size)
928 return -EINVAL;
929
930 if (!d->cached)
931 return 0;
932
933 left = d->size - offset;
934 total_len = left > size ? size: left;
935 memcpy(buf, cache + offset, total_len);
936
937 return total_len;
938 }
939
940 pid_t initpid = lookup_initpid_in_store(fc->pid);
941 if (initpid <= 1 || is_shared_pidns(initpid))
942 initpid = fc->pid;
943
944 cg = get_pid_cgroup(initpid, "cpuset");
945 if (!cg)
946 return read_file_fuse("proc/cpuinfo", buf, size, d);
947 prune_init_slice(cg);
948
949 cpuset = get_cpuset(cg);
950 if (!cpuset)
951 return 0;
952
953 if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs)
954 use_view = true;
955 else
956 use_view = false;
957 if (use_view)
958 max_cpus = max_cpu_count(cg);
959
960 f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache);
961 if (!f)
962 return 0;
963
964 while (getline(&line, &linelen, f) != -1) {
965 ssize_t l;
966 if (firstline) {
967 firstline = false;
968 if (strstr(line, "IBM/S390") != NULL) {
969 is_s390x = true;
970 am_printing = true;
971 continue;
972 }
973 }
974
975 if (strncmp(line, "# processors:", 12) == 0)
976 continue;
977
978 if (is_processor_line(line)) {
979 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
980 break;
981
982 am_printing = cpuline_in_cpuset(line, cpuset);
983 if (am_printing) {
984 curcpu++;
985 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
986 if (l < 0)
987 return log_error(0, "Failed to write cache");
988 if ((size_t)l >= cache_size)
989 return log_error(0, "Write to cache was truncated");
990 cache += l;
991 cache_size -= l;
992 total_len += l;
993 }
994 continue;
995 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
996 char *p;
997
998 if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
999 break;
1000
1001 if (!cpu_in_cpuset(cpu, cpuset))
1002 continue;
1003
1004 curcpu++;
1005 p = strchr(line, ':');
1006 if (!p || !*p)
1007 return 0;
1008 p++;
1009
1010 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
1011 if (l < 0)
1012 return log_error(0, "Failed to write cache");
1013 if ((size_t)l >= cache_size)
1014 return log_error(0, "Write to cache was truncated");
1015
1016 cache += l;
1017 cache_size -= l;
1018 total_len += l;
1019 continue;
1020
1021 }
1022 if (am_printing) {
1023 l = snprintf(cache, cache_size, "%s", line);
1024 if (l < 0)
1025 return log_error(0, "Failed to write cache");
1026 if ((size_t)l >= cache_size)
1027 return log_error(0, "Write to cache was truncated");
1028
1029 cache += l;
1030 cache_size -= l;
1031 total_len += l;
1032 }
1033 }
1034
1035 if (is_s390x) {
1036 __do_free char *origcache = d->buf;
1037 ssize_t l;
1038
1039 d->buf = malloc(d->buflen);
1040 if (!d->buf) {
1041 d->buf = move_ptr(origcache);
1042 return 0;
1043 }
1044
1045 cache = d->buf;
1046 cache_size = d->buflen;
1047 total_len = 0;
1048 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
1049 if (l < 0 || (size_t)l >= cache_size)
1050 return 0;
1051
1052 cache_size -= l;
1053 cache += l;
1054 total_len += l;
1055 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
1056 if (l < 0 || (size_t)l >= cache_size)
1057 return 0;
1058
1059 cache_size -= l;
1060 cache += l;
1061 total_len += l;
1062 l = snprintf(cache, cache_size, "%s", origcache);
1063 if (l < 0 || (size_t)l >= cache_size)
1064 return 0;
1065 total_len += l;
1066 }
1067
1068 d->cached = 1;
1069 d->size = total_len;
1070 if (total_len > size)
1071 total_len = size;
1072
1073 /* read from off 0 */
1074 memcpy(buf, d->buf, total_len);
1075
1076 return total_len;
1077 }
1078
1079 /*
1080 * Returns 0 on success.
1081 * It is the caller's responsibility to free `return_usage`, unless this
1082 * function returns an error.
1083 */
1084 int read_cpuacct_usage_all(char *cg, char *cpuset,
1085 struct cpuacct_usage **return_usage, int *size)
1086 {
1087 __do_free char *usage_str = NULL;
1088 __do_free struct cpuacct_usage *cpu_usage = NULL;
1089 int i = 0, j = 0, read_pos = 0, read_cnt = 0;
1090 int cpucount;
1091 int ret;
1092 int cg_cpu;
1093 uint64_t cg_user, cg_system;
1094 int64_t ticks_per_sec;
1095
1096 ticks_per_sec = sysconf(_SC_CLK_TCK);
1097 if (ticks_per_sec < 0 && errno == EINVAL) {
1098 lxcfs_debug("%m - Failed to determine number of ticks per second");
1099 return -1;
1100 }
1101
1102 cpucount = get_nprocs_conf();
1103 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
1104 if (!cpu_usage)
1105 return -ENOMEM;
1106
1107 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
1108 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
1109 char *sep = " \t\n";
1110 char *tok;
1111
1112 /* Read cpuacct.usage_percpu instead. */
1113 lxcfs_debug("Falling back to cpuacct.usage_percpu");
1114 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
1115 return -1;
1116
1117 lxc_iterate_parts(tok, usage_str, sep) {
1118 uint64_t percpu_user;
1119
1120 if (i >= cpucount)
1121 break;
1122
1123 tok = trim_whitespace_in_place(tok);
1124 ret = safe_uint64(tok, &percpu_user, 10);
1125 if (ret)
1126 return -1;
1127
1128 /* Convert the time from nanoseconds to USER_HZ */
1129 cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1130 cpu_usage[i].system = cpu_usage[i].user;
1131 i++;
1132 lxcfs_debug("cpu%d with time %s", i, tok);
1133 }
1134 } else {
1135 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0)
1136 return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg);
1137
1138 read_pos += read_cnt;
1139
1140 for (i = 0, j = 0; i < cpucount; i++) {
1141 ret = sscanf(usage_str + read_pos,
1142 "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu,
1143 &cg_user, &cg_system, &read_cnt);
1144
1145 if (ret == EOF)
1146 break;
1147
1148 if (ret != 3)
1149 return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s",
1150 usage_str + read_pos, cg);
1151
1152 read_pos += read_cnt;
1153
1154 /* Convert the time from nanoseconds to USER_HZ */
1155 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
1156 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
1157 j++;
1158 }
1159 }
1160
1161 *return_usage = move_ptr(cpu_usage);
1162 *size = cpucount;
1163 return 0;
1164 }
1165
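/* Allocate and initialize one hash bucket head, including its rwlock. */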
1166 static bool cpuview_init_head(struct cg_proc_stat_head **head)
1167 {
1168 __do_free struct cg_proc_stat_head *h;
1169
1170 h = zalloc(sizeof(struct cg_proc_stat_head));
1171 if (!h)
1172 return false;
1173
1174 if (pthread_rwlock_init(&h->lock, NULL))
1175 return false;
1176
1177 h->lastcheck = time(NULL);
1178
1179 *head = move_ptr(h);
1180 return true;
1181 }
1182
1183 bool init_cpuview(void)
1184 {
1185 int i;
1186
1187 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
1188 proc_stat_history[i] = NULL;
1189
1190 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1191 if (!cpuview_init_head(&proc_stat_history[i]))
1192 goto err;
1193 }
1194
1195 return true;
1196
1197 err:
1198 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
1199 if (proc_stat_history[i])
1200 free_disarm(proc_stat_history[i]);
1201 }
1202
1203 return false;
1204 }
1205
1206 static void cpuview_free_head(struct cg_proc_stat_head *head)
1207 {
1208 struct cg_proc_stat *node;
1209
1210 if (head->next) {
1211 node = head->next;
1212
1213 for (;;) {
1214 struct cg_proc_stat *cur = node;
1215 node = node->next;
1216 free_proc_stat_node(cur);
1217 if (!node)
1218 break;
1219 }
1220 }
1221
1222 pthread_rwlock_destroy(&head->lock);
1223 free_disarm(head);
1224 }
1225
1226 void free_cpuview(void)
1227 {
1228 for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
1229 if (proc_stat_history[i])
1230 cpuview_free_head(proc_stat_history[i]);
1231 }