]>
Commit | Line | Data |
---|---|---|
1 | /* SPDX-License-Identifier: LGPL-2.1+ */ | |
2 | ||
3 | #include "config.h" | |
4 | ||
5 | #include <dirent.h> | |
6 | #include <errno.h> | |
7 | #include <fcntl.h> | |
8 | #include <inttypes.h> | |
9 | #include <libgen.h> | |
10 | #include <pthread.h> | |
11 | #include <sched.h> | |
12 | #include <stdarg.h> | |
13 | #include <stdbool.h> | |
14 | #include <stdint.h> | |
15 | #include <stdio.h> | |
16 | #include <stdlib.h> | |
17 | #include <string.h> | |
18 | #include <time.h> | |
19 | #include <unistd.h> | |
20 | #include <wait.h> | |
21 | #include <linux/magic.h> | |
22 | #include <linux/sched.h> | |
23 | #include <sys/epoll.h> | |
24 | #include <sys/mman.h> | |
25 | #include <sys/mount.h> | |
26 | #include <sys/param.h> | |
27 | #include <sys/socket.h> | |
28 | #include <sys/syscall.h> | |
29 | #include <sys/sysinfo.h> | |
30 | #include <sys/vfs.h> | |
31 | ||
32 | #include "proc_cpuview.h" | |
33 | ||
34 | #include "bindings.h" | |
35 | #include "cgroup_fuse.h" | |
36 | #include "cpuset_parse.h" | |
37 | #include "cgroups/cgroup.h" | |
38 | #include "cgroups/cgroup_utils.h" | |
39 | #include "memory_utils.h" | |
40 | #include "proc_loadavg.h" | |
41 | #include "utils.h" | |
42 | ||
/*
 * Data for CPU view.
 *
 * One record per cgroup. Records that hash to the same bucket are chained
 * through `next`.
 */
struct cg_proc_stat {
	/* Cgroup path; heap-allocated and released together with the node. */
	char *cg;

	/* Real usage as read from the host's /proc/stat. */
	struct cpuacct_usage *usage;

	/* Usage stats reported to the container. */
	struct cpuacct_usage *view;

	/* Number of elements in both the `usage` and the `view` array. */
	int cpu_count;

	/* For node manipulation. */
	pthread_mutex_t lock;

	/* Next record in the same bucket, NULL at the end of the chain. */
	struct cg_proc_stat *next;
};
52 | ||
53 | struct cg_proc_stat_head { | |
54 | struct cg_proc_stat *next; | |
55 | time_t lastcheck; | |
56 | ||
57 | /* | |
58 | * For access to the list. Reading can be parallel, pruning is exclusive. | |
59 | */ | |
60 | pthread_rwlock_t lock; | |
61 | }; | |
62 | ||
/* Number of buckets in the CPU view history table. */
#define CPUVIEW_HASH_SIZE 100

/* Bucket heads, indexed by calc_hash(cgroup path) % CPUVIEW_HASH_SIZE. */
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
65 | ||
66 | static void reset_proc_stat_node(struct cg_proc_stat *node, | |
67 | struct cpuacct_usage *usage, int cpu_count) | |
68 | { | |
69 | lxcfs_debug("Resetting stat node for %s\n", node->cg); | |
70 | memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count); | |
71 | ||
72 | for (int i = 0; i < cpu_count; i++) { | |
73 | node->view[i].user = 0; | |
74 | node->view[i].system = 0; | |
75 | node->view[i].idle = 0; | |
76 | } | |
77 | ||
78 | node->cpu_count = cpu_count; | |
79 | } | |
80 | ||
81 | static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count) | |
82 | { | |
83 | __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL; | |
84 | ||
85 | /* Allocate new memory */ | |
86 | new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count); | |
87 | if (!new_usage) | |
88 | return false; | |
89 | ||
90 | new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count); | |
91 | if (!new_view) | |
92 | return false; | |
93 | ||
94 | /* Copy existing data & initialize new elements */ | |
95 | for (int i = 0; i < cpu_count; i++) { | |
96 | if (i < node->cpu_count) { | |
97 | new_usage[i].user = node->usage[i].user; | |
98 | new_usage[i].system = node->usage[i].system; | |
99 | new_usage[i].idle = node->usage[i].idle; | |
100 | ||
101 | new_view[i].user = node->view[i].user; | |
102 | new_view[i].system = node->view[i].system; | |
103 | new_view[i].idle = node->view[i].idle; | |
104 | } | |
105 | } | |
106 | ||
107 | free(node->usage); | |
108 | node->usage = move_ptr(new_usage); | |
109 | ||
110 | free(node->view); | |
111 | node->view = move_ptr(new_view); | |
112 | node->cpu_count = cpu_count; | |
113 | ||
114 | return true; | |
115 | } | |
116 | ||
117 | static void free_proc_stat_node(struct cg_proc_stat *node) | |
118 | { | |
119 | if (node) { | |
120 | /* | |
121 | * We're abusing the usage pointer to indicate that | |
122 | * pthread_mutex_init() was successful. Don't judge me. | |
123 | */ | |
124 | if (node->usage) | |
125 | pthread_mutex_destroy(&node->lock); | |
126 | free_disarm(node->cg); | |
127 | free_disarm(node->usage); | |
128 | free_disarm(node->view); | |
129 | free_disarm(node); | |
130 | } | |
131 | } | |
132 | ||
133 | define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node); | |
134 | ||
135 | static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node) | |
136 | { | |
137 | call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node; | |
138 | struct cg_proc_stat *rv = new_node; | |
139 | int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE; | |
140 | struct cg_proc_stat_head *head = proc_stat_history[hash]; | |
141 | struct cg_proc_stat *cur; | |
142 | ||
143 | pthread_rwlock_wrlock(&head->lock); | |
144 | ||
145 | if (!head->next) { | |
146 | head->next = move_ptr(new); | |
147 | goto out_rwlock_unlock; | |
148 | } | |
149 | ||
150 | cur = head->next; | |
151 | ||
152 | for (;;) { | |
153 | /* | |
154 | * The node to be added is already present in the list, so | |
155 | * free the newly allocated one and return the one we found. | |
156 | */ | |
157 | if (strcmp(cur->cg, new->cg) == 0) { | |
158 | rv = cur; | |
159 | goto out_rwlock_unlock; | |
160 | } | |
161 | ||
162 | /* Keep walking. */ | |
163 | if (cur->next) { | |
164 | cur = cur->next; | |
165 | continue; | |
166 | } | |
167 | ||
168 | /* Add new node to end of list. */ | |
169 | cur->next = move_ptr(new); | |
170 | goto out_rwlock_unlock; | |
171 | } | |
172 | ||
173 | out_rwlock_unlock: | |
174 | pthread_rwlock_unlock(&head->lock); | |
175 | return move_ptr(rv); | |
176 | } | |
177 | ||
178 | static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, | |
179 | int cpu_count, const char *cg) | |
180 | { | |
181 | call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL; | |
182 | __do_free struct cpuacct_usage *new_usage = NULL; | |
183 | ||
184 | node = zalloc(sizeof(struct cg_proc_stat)); | |
185 | if (!node) | |
186 | return NULL; | |
187 | ||
188 | node->cg = strdup(cg); | |
189 | if (!node->cg) | |
190 | return NULL; | |
191 | ||
192 | new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count); | |
193 | if (!new_usage) | |
194 | return NULL; | |
195 | ||
196 | node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count); | |
197 | if (!node->view) | |
198 | return NULL; | |
199 | ||
200 | node->cpu_count = cpu_count; | |
201 | ||
202 | if (pthread_mutex_init(&node->lock, NULL)) | |
203 | return NULL; | |
204 | /* | |
205 | * We're abusing the usage pointer to indicate that | |
206 | * pthread_mutex_init() was successful. Don't judge me. | |
207 | */ | |
208 | node->usage = move_ptr(new_usage); | |
209 | ||
210 | return move_ptr(node); | |
211 | } | |
212 | ||
213 | static bool cgroup_supports(const char *controller, const char *cgroup, | |
214 | const char *file) | |
215 | { | |
216 | __do_free char *path = NULL; | |
217 | int cfd; | |
218 | ||
219 | cfd = get_cgroup_fd(controller); | |
220 | if (cfd < 0) | |
221 | return false; | |
222 | ||
223 | path = must_make_path_relative(cgroup, file, NULL); | |
224 | return faccessat(cfd, path, F_OK, 0) == 0; | |
225 | } | |
226 | ||
227 | static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node) | |
228 | { | |
229 | struct cg_proc_stat *first = NULL; | |
230 | ||
231 | for (struct cg_proc_stat *prev = NULL; node; ) { | |
232 | if (!cgroup_supports("cpu", node->cg, "cpu.shares")) { | |
233 | struct cg_proc_stat *cur = node; | |
234 | ||
235 | if (prev) | |
236 | prev->next = node->next; | |
237 | else | |
238 | first = node->next; | |
239 | ||
240 | node = node->next; | |
241 | lxcfs_debug("Removing stat node for %s\n", cur); | |
242 | ||
243 | free_proc_stat_node(cur); | |
244 | } else { | |
245 | if (!first) | |
246 | first = node; | |
247 | prev = node; | |
248 | node = node->next; | |
249 | } | |
250 | } | |
251 | ||
252 | return first; | |
253 | } | |
254 | ||
255 | #define PROC_STAT_PRUNE_INTERVAL 10 | |
256 | static void prune_proc_stat_history(void) | |
257 | { | |
258 | time_t now = time(NULL); | |
259 | ||
260 | for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) { | |
261 | pthread_rwlock_wrlock(&proc_stat_history[i]->lock); | |
262 | ||
263 | if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) { | |
264 | pthread_rwlock_unlock(&proc_stat_history[i]->lock); | |
265 | return; | |
266 | } | |
267 | ||
268 | if (proc_stat_history[i]->next) { | |
269 | proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next); | |
270 | proc_stat_history[i]->lastcheck = now; | |
271 | } | |
272 | ||
273 | pthread_rwlock_unlock(&proc_stat_history[i]->lock); | |
274 | } | |
275 | } | |
276 | ||
277 | static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, | |
278 | const char *cg) | |
279 | { | |
280 | struct cg_proc_stat *node; | |
281 | ||
282 | pthread_rwlock_rdlock(&head->lock); | |
283 | ||
284 | if (!head->next) { | |
285 | pthread_rwlock_unlock(&head->lock); | |
286 | return NULL; | |
287 | } | |
288 | ||
289 | node = head->next; | |
290 | ||
291 | do { | |
292 | if (strcmp(cg, node->cg) == 0) | |
293 | goto out; | |
294 | } while ((node = node->next)); | |
295 | ||
296 | node = NULL; | |
297 | ||
298 | out: | |
299 | pthread_rwlock_unlock(&head->lock); | |
300 | prune_proc_stat_history(); | |
301 | return node; | |
302 | } | |
303 | ||
304 | static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, | |
305 | int cpu_count, const char *cg) | |
306 | { | |
307 | int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE; | |
308 | struct cg_proc_stat_head *head = proc_stat_history[hash]; | |
309 | struct cg_proc_stat *node; | |
310 | ||
311 | node = find_proc_stat_node(head, cg); | |
312 | if (!node) { | |
313 | node = new_proc_stat_node(usage, cpu_count, cg); | |
314 | if (!node) | |
315 | return NULL; | |
316 | ||
317 | node = add_proc_stat_node(node); | |
318 | lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg); | |
319 | } | |
320 | ||
321 | pthread_mutex_lock(&node->lock); | |
322 | ||
323 | /* | |
324 | * If additional CPUs on the host have been enabled, CPU usage counter | |
325 | * arrays have to be expanded. | |
326 | */ | |
327 | if (node->cpu_count < cpu_count) { | |
328 | lxcfs_debug("Expanding stat node %d->%d for %s\n", | |
329 | node->cpu_count, cpu_count, cg); | |
330 | ||
331 | if (!expand_proc_stat_node(node, cpu_count)) { | |
332 | pthread_mutex_unlock(&node->lock); | |
333 | return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg); | |
334 | } | |
335 | } | |
336 | ||
337 | return node; | |
338 | } | |
339 | ||
340 | static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage, | |
341 | uint64_t *counter, uint64_t threshold) | |
342 | { | |
343 | uint64_t free_space, to_add; | |
344 | ||
345 | free_space = threshold - usage->user - usage->system; | |
346 | ||
347 | if (free_space > usage->idle) | |
348 | free_space = usage->idle; | |
349 | ||
350 | if (free_space > *surplus) | |
351 | to_add = *surplus; | |
352 | else | |
353 | to_add = free_space; | |
354 | ||
355 | *counter += to_add; | |
356 | usage->idle -= to_add; | |
357 | *surplus -= to_add; | |
358 | } | |
359 | ||
360 | static uint64_t diff_cpu_usage(struct cpuacct_usage *older, | |
361 | struct cpuacct_usage *newer, | |
362 | struct cpuacct_usage *diff, int cpu_count) | |
363 | { | |
364 | uint64_t sum = 0; | |
365 | ||
366 | for (int i = 0; i < cpu_count; i++) { | |
367 | if (!newer[i].online) | |
368 | continue; | |
369 | ||
370 | /* | |
371 | * When cpuset is changed on the fly, the CPUs might get | |
372 | * reordered. We could either reset all counters, or check | |
373 | * that the substractions below will return expected results. | |
374 | */ | |
375 | if (newer[i].user > older[i].user) | |
376 | diff[i].user = newer[i].user - older[i].user; | |
377 | else | |
378 | diff[i].user = 0; | |
379 | ||
380 | if (newer[i].system > older[i].system) | |
381 | diff[i].system = newer[i].system - older[i].system; | |
382 | else | |
383 | diff[i].system = 0; | |
384 | ||
385 | if (newer[i].idle > older[i].idle) | |
386 | diff[i].idle = newer[i].idle - older[i].idle; | |
387 | else | |
388 | diff[i].idle = 0; | |
389 | ||
390 | sum += diff[i].user; | |
391 | sum += diff[i].system; | |
392 | sum += diff[i].idle; | |
393 | } | |
394 | ||
395 | return sum; | |
396 | } | |
397 | ||
398 | /* | |
399 | * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or | |
400 | * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned | |
401 | * through `value`. | |
402 | */ | |
403 | static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value) | |
404 | { | |
405 | __do_free char *str = NULL; | |
406 | char file[STRLITERALLEN("cpu.cfs_period_us") + 1]; | |
407 | bool first = true; | |
408 | int ret; | |
409 | ||
410 | if (pure_unified_layout(cgroup_ops)) { | |
411 | first = !strcmp(param, "quota"); | |
412 | ret = snprintf(file, sizeof(file), "cpu.max"); | |
413 | } else { | |
414 | ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param); | |
415 | } | |
416 | if (ret < 0 || (size_t)ret >= sizeof(file)) | |
417 | return false; | |
418 | ||
419 | if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str)) | |
420 | return false; | |
421 | ||
422 | return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1; | |
423 | } | |
424 | ||
/*
 * Return the exact (possibly fractional) number of visible CPUs of @cg,
 * i.e. quota / period, capped at get_nprocs().
 *
 * Zero is returned if either parameter cannot be read or is not positive.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t cfs_quota, cfs_period;
	double cpus;
	int host_cpus;

	/* Short-circuiting keeps the original read order: quota, then period. */
	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota) ||
	    !read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	cpus = (double)cfs_quota / (double)cfs_period;

	host_cpus = get_nprocs();

	return cpus > host_cpus ? host_cpus : cpus;
}
453 | ||
454 | /* | |
455 | * Return the maximum number of visible CPUs based on CPU quotas. | |
456 | * If there is no quota set, zero is returned. | |
457 | */ | |
458 | int max_cpu_count(const char *cg) | |
459 | { | |
460 | __do_free char *cpuset = NULL; | |
461 | int rv, nprocs; | |
462 | int64_t cfs_quota, cfs_period; | |
463 | int nr_cpus_in_cpuset = 0; | |
464 | ||
465 | if (!read_cpu_cfs_param(cg, "quota", &cfs_quota)) | |
466 | return 0; | |
467 | ||
468 | if (!read_cpu_cfs_param(cg, "period", &cfs_period)) | |
469 | return 0; | |
470 | ||
471 | cpuset = get_cpuset(cg); | |
472 | if (cpuset) | |
473 | nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset); | |
474 | ||
475 | if (cfs_quota <= 0 || cfs_period <= 0) { | |
476 | if (nr_cpus_in_cpuset > 0) | |
477 | return nr_cpus_in_cpuset; | |
478 | ||
479 | return 0; | |
480 | } | |
481 | ||
482 | rv = cfs_quota / cfs_period; | |
483 | ||
484 | /* | |
485 | * In case quota/period does not yield a whole number, add one CPU for | |
486 | * the remainder. | |
487 | */ | |
488 | if ((cfs_quota % cfs_period) > 0) | |
489 | rv += 1; | |
490 | ||
491 | nprocs = get_nprocs(); | |
492 | if (rv > nprocs) | |
493 | rv = nprocs; | |
494 | ||
495 | /* Use min value in cpu quota and cpuset. */ | |
496 | if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv) | |
497 | rv = nr_cpus_in_cpuset; | |
498 | ||
499 | return rv; | |
500 | } | |
501 | ||
502 | int cpuview_proc_stat(const char *cg, const char *cpuset, | |
503 | struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, | |
504 | FILE *f, char *buf, size_t buf_size) | |
505 | { | |
506 | __do_free char *line = NULL; | |
507 | __do_free struct cpuacct_usage *diff = NULL; | |
508 | size_t linelen = 0, total_len = 0; | |
509 | int curcpu = -1; /* cpu numbering starts at 0 */ | |
510 | int physcpu, i; | |
511 | int cpu_cnt = 0; | |
512 | uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, | |
513 | softirq = 0, steal = 0, guest = 0, guest_nice = 0; | |
514 | uint64_t user_sum = 0, system_sum = 0, idle_sum = 0; | |
515 | uint64_t user_surplus = 0, system_surplus = 0; | |
516 | int nprocs, max_cpus; | |
517 | ssize_t l; | |
518 | uint64_t total_sum, threshold; | |
519 | struct cg_proc_stat *stat_node; | |
520 | ||
521 | nprocs = get_nprocs_conf(); | |
522 | if (cg_cpu_usage_size < nprocs) | |
523 | nprocs = cg_cpu_usage_size; | |
524 | ||
525 | /* Read all CPU stats and stop when we've encountered other lines */ | |
526 | while (getline(&line, &linelen, f) != -1) { | |
527 | int ret; | |
528 | char cpu_char[10]; /* That's a lot of cores */ | |
529 | uint64_t all_used, cg_used; | |
530 | ||
531 | if (strlen(line) == 0) | |
532 | continue; | |
533 | ||
534 | /* not a ^cpuN line containing a number N */ | |
535 | if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) | |
536 | break; | |
537 | ||
538 | if (sscanf(cpu_char, "%d", &physcpu) != 1) | |
539 | continue; | |
540 | ||
541 | if (physcpu >= cg_cpu_usage_size) | |
542 | continue; | |
543 | ||
544 | curcpu++; | |
545 | cpu_cnt++; | |
546 | ||
547 | if (!cpu_in_cpuset(physcpu, cpuset)) { | |
548 | for (i = curcpu; i <= physcpu; i++) | |
549 | cg_cpu_usage[i].online = false; | |
550 | continue; | |
551 | } | |
552 | ||
553 | if (curcpu < physcpu) { | |
554 | /* Some CPUs may be disabled */ | |
555 | for (i = curcpu; i < physcpu; i++) | |
556 | cg_cpu_usage[i].online = false; | |
557 | ||
558 | curcpu = physcpu; | |
559 | } | |
560 | ||
561 | cg_cpu_usage[curcpu].online = true; | |
562 | ||
563 | ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu", | |
564 | &user, | |
565 | &nice, | |
566 | &system, | |
567 | &idle, | |
568 | &iowait, | |
569 | &irq, | |
570 | &softirq, | |
571 | &steal, | |
572 | &guest, | |
573 | &guest_nice); | |
574 | if (ret != 10) | |
575 | continue; | |
576 | ||
577 | all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice; | |
578 | cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system; | |
579 | ||
580 | if (all_used >= cg_used) { | |
581 | cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used); | |
582 | } else { | |
583 | lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time", | |
584 | curcpu, cg, all_used, cg_used); | |
585 | cg_cpu_usage[curcpu].idle = idle; | |
586 | } | |
587 | } | |
588 | ||
589 | /* Cannot use more CPUs than is available in cpuset. */ | |
590 | max_cpus = max_cpu_count(cg); | |
591 | if (max_cpus > cpu_cnt || !max_cpus) | |
592 | max_cpus = cpu_cnt; | |
593 | ||
594 | /* takes lock pthread_mutex_lock(&node->lock) */ | |
595 | stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg); | |
596 | if (!stat_node) | |
597 | return log_error(0, "Failed to find/create stat node for %s", cg); | |
598 | ||
599 | diff = zalloc(sizeof(struct cpuacct_usage) * nprocs); | |
600 | if (!diff) | |
601 | goto out_pthread_mutex_unlock; | |
602 | ||
603 | /* | |
604 | * If the new values are LOWER than values stored in memory, it means | |
605 | * the cgroup has been reset/recreated and we should reset too. | |
606 | */ | |
607 | for (curcpu = 0; curcpu < nprocs; curcpu++) { | |
608 | if (!cg_cpu_usage[curcpu].online) | |
609 | continue; | |
610 | ||
611 | if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user) | |
612 | reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs); | |
613 | ||
614 | break; | |
615 | } | |
616 | ||
617 | total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs); | |
618 | ||
619 | for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { | |
620 | stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online; | |
621 | ||
622 | if (!stat_node->usage[curcpu].online) | |
623 | continue; | |
624 | ||
625 | i++; | |
626 | ||
627 | stat_node->usage[curcpu].user += diff[curcpu].user; | |
628 | stat_node->usage[curcpu].system += diff[curcpu].system; | |
629 | stat_node->usage[curcpu].idle += diff[curcpu].idle; | |
630 | ||
631 | if (max_cpus > 0 && i >= max_cpus) { | |
632 | user_surplus += diff[curcpu].user; | |
633 | system_surplus += diff[curcpu].system; | |
634 | } | |
635 | } | |
636 | ||
637 | /* Calculate usage counters of visible CPUs */ | |
638 | if (max_cpus > 0) { | |
639 | uint64_t diff_user = 0; | |
640 | uint64_t diff_system = 0; | |
641 | uint64_t diff_idle = 0; | |
642 | uint64_t max_diff_idle = 0; | |
643 | uint64_t max_diff_idle_index = 0; | |
644 | double exact_cpus; | |
645 | /* threshold = maximum usage per cpu, including idle */ | |
646 | threshold = total_sum / cpu_cnt * max_cpus; | |
647 | ||
648 | for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { | |
649 | if (!stat_node->usage[curcpu].online) | |
650 | continue; | |
651 | ||
652 | i++; | |
653 | ||
654 | if (i == max_cpus) | |
655 | break; | |
656 | ||
657 | if (diff[curcpu].user + diff[curcpu].system >= threshold) | |
658 | continue; | |
659 | ||
660 | /* Add user */ | |
661 | add_cpu_usage(&user_surplus, &diff[curcpu], | |
662 | &diff[curcpu].user, threshold); | |
663 | ||
664 | if (diff[curcpu].user + diff[curcpu].system >= threshold) | |
665 | continue; | |
666 | ||
667 | /* If there is still room, add system */ | |
668 | add_cpu_usage(&system_surplus, &diff[curcpu], | |
669 | &diff[curcpu].system, threshold); | |
670 | } | |
671 | ||
672 | if (user_surplus > 0) | |
673 | lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg); | |
674 | if (system_surplus > 0) | |
675 | lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg); | |
676 | ||
677 | for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { | |
678 | if (!stat_node->usage[curcpu].online) | |
679 | continue; | |
680 | ||
681 | i++; | |
682 | ||
683 | if (i == max_cpus) | |
684 | break; | |
685 | ||
686 | stat_node->view[curcpu].user += diff[curcpu].user; | |
687 | stat_node->view[curcpu].system += diff[curcpu].system; | |
688 | stat_node->view[curcpu].idle += diff[curcpu].idle; | |
689 | ||
690 | user_sum += stat_node->view[curcpu].user; | |
691 | system_sum += stat_node->view[curcpu].system; | |
692 | idle_sum += stat_node->view[curcpu].idle; | |
693 | ||
694 | diff_user += diff[curcpu].user; | |
695 | diff_system += diff[curcpu].system; | |
696 | diff_idle += diff[curcpu].idle; | |
697 | if (diff[curcpu].idle > max_diff_idle) { | |
698 | max_diff_idle = diff[curcpu].idle; | |
699 | max_diff_idle_index = curcpu; | |
700 | } | |
701 | ||
702 | lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle); | |
703 | } | |
704 | lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle); | |
705 | ||
706 | /* revise cpu usage view to support partial cpu case. */ | |
707 | exact_cpus = exact_cpu_count(cg); | |
708 | if (exact_cpus < (double)max_cpus){ | |
709 | uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus)); | |
710 | ||
711 | lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus); | |
712 | lxcfs_v("delta: %lu\n", delta); | |
713 | lxcfs_v("idle_sum before: %lu\n", idle_sum); | |
714 | if (idle_sum > delta) | |
715 | idle_sum = idle_sum - delta; | |
716 | else | |
717 | idle_sum = 0; | |
718 | lxcfs_v("idle_sum after: %lu\n", idle_sum); | |
719 | ||
720 | curcpu = max_diff_idle_index; | |
721 | lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle); | |
722 | if (stat_node->view[curcpu].idle > delta) | |
723 | stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta; | |
724 | else | |
725 | stat_node->view[curcpu].idle = 0; | |
726 | lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle); | |
727 | } | |
728 | } else { | |
729 | for (curcpu = 0; curcpu < nprocs; curcpu++) { | |
730 | if (!stat_node->usage[curcpu].online) | |
731 | continue; | |
732 | ||
733 | stat_node->view[curcpu].user = stat_node->usage[curcpu].user; | |
734 | stat_node->view[curcpu].system = stat_node->usage[curcpu].system; | |
735 | stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle; | |
736 | ||
737 | user_sum += stat_node->view[curcpu].user; | |
738 | system_sum += stat_node->view[curcpu].system; | |
739 | idle_sum += stat_node->view[curcpu].idle; | |
740 | } | |
741 | } | |
742 | ||
743 | /* Render the file */ | |
744 | /* cpu-all */ | |
745 | l = snprintf(buf, buf_size, | |
746 | "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n", | |
747 | user_sum, system_sum, idle_sum); | |
748 | lxcfs_v("cpu-all: %s\n", buf); | |
749 | if (l < 0) { | |
750 | lxcfs_error("Failed to write cache"); | |
751 | total_len = 0; | |
752 | goto out_pthread_mutex_unlock; | |
753 | } | |
754 | if ((size_t)l >= buf_size) { | |
755 | lxcfs_error("Write to cache was truncated"); | |
756 | total_len = 0; | |
757 | goto out_pthread_mutex_unlock; | |
758 | } | |
759 | ||
760 | buf += l; | |
761 | buf_size -= l; | |
762 | total_len += l; | |
763 | ||
764 | /* Render visible CPUs */ | |
765 | for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { | |
766 | if (!stat_node->usage[curcpu].online) | |
767 | continue; | |
768 | ||
769 | i++; | |
770 | ||
771 | if (max_cpus > 0 && i == max_cpus) | |
772 | break; | |
773 | ||
774 | l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n", | |
775 | i, | |
776 | stat_node->view[curcpu].user, | |
777 | stat_node->view[curcpu].system, | |
778 | stat_node->view[curcpu].idle); | |
779 | lxcfs_v("cpu: %s\n", buf); | |
780 | if (l < 0) { | |
781 | lxcfs_error("Failed to write cache"); | |
782 | total_len = 0; | |
783 | goto out_pthread_mutex_unlock; | |
784 | } | |
785 | if ((size_t)l >= buf_size) { | |
786 | lxcfs_error("Write to cache was truncated"); | |
787 | total_len = 0; | |
788 | goto out_pthread_mutex_unlock; | |
789 | } | |
790 | ||
791 | buf += l; | |
792 | buf_size -= l; | |
793 | total_len += l; | |
794 | } | |
795 | ||
796 | /* Pass the rest of /proc/stat, start with the last line read */ | |
797 | l = snprintf(buf, buf_size, "%s", line); | |
798 | if (l < 0) { | |
799 | lxcfs_error("Failed to write cache"); | |
800 | total_len = 0; | |
801 | goto out_pthread_mutex_unlock; | |
802 | } | |
803 | if ((size_t)l >= buf_size) { | |
804 | lxcfs_error("Write to cache was truncated"); | |
805 | total_len = 0; | |
806 | goto out_pthread_mutex_unlock; | |
807 | } | |
808 | ||
809 | buf += l; | |
810 | buf_size -= l; | |
811 | total_len += l; | |
812 | ||
813 | /* Pass the rest of the host's /proc/stat */ | |
814 | while (getline(&line, &linelen, f) != -1) { | |
815 | l = snprintf(buf, buf_size, "%s", line); | |
816 | if (l < 0) { | |
817 | lxcfs_error("Failed to write cache"); | |
818 | total_len = 0; | |
819 | goto out_pthread_mutex_unlock; | |
820 | } | |
821 | if ((size_t)l >= buf_size) { | |
822 | lxcfs_error("Write to cache was truncated"); | |
823 | total_len = 0; | |
824 | goto out_pthread_mutex_unlock; | |
825 | } | |
826 | ||
827 | buf += l; | |
828 | buf_size -= l; | |
829 | total_len += l; | |
830 | } | |
831 | ||
832 | out_pthread_mutex_unlock: | |
833 | if (stat_node) | |
834 | pthread_mutex_unlock(&stat_node->lock); | |
835 | ||
836 | return total_len; | |
837 | } | |
838 | ||
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. the word "processor", a colon and a decimal number, with optional
 * whitespace in between.
 */
static inline bool is_processor_line(const char *line)
{
	int ignored;

	if (sscanf(line, "processor : %d", &ignored) != 1)
		return false;

	return true;
}
847 | ||
/*
 * Check whether @line is a "processor : N" line whose CPU number N is part
 * of @cpuset. Lines of any other form yield false.
 */
static inline bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_nr;

	if (sscanf(line, "processor : %d", &cpu_nr) != 1)
		return false;

	return cpu_in_cpuset(cpu_nr, cpuset);
}
857 | ||
858 | int proc_cpuinfo_read(char *buf, size_t size, off_t offset, | |
859 | struct fuse_file_info *fi) | |
860 | { | |
861 | __do_free char *cg = NULL, *cpuset = NULL, *line = NULL; | |
862 | __do_free void *fopen_cache = NULL; | |
863 | __do_fclose FILE *f = NULL; | |
864 | struct fuse_context *fc = fuse_get_context(); | |
865 | struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data; | |
866 | struct file_info *d = INTTYPE_TO_PTR(fi->fh); | |
867 | size_t linelen = 0, total_len = 0; | |
868 | bool am_printing = false, firstline = true, is_s390x = false; | |
869 | int curcpu = -1, cpu, max_cpus = 0; | |
870 | bool use_view; | |
871 | char *cache = d->buf; | |
872 | size_t cache_size = d->buflen; | |
873 | ||
874 | if (offset) { | |
875 | size_t left; | |
876 | ||
877 | if (offset > d->size) | |
878 | return -EINVAL; | |
879 | ||
880 | if (!d->cached) | |
881 | return 0; | |
882 | ||
883 | left = d->size - offset; | |
884 | total_len = left > size ? size: left; | |
885 | memcpy(buf, cache + offset, total_len); | |
886 | ||
887 | return total_len; | |
888 | } | |
889 | ||
890 | pid_t initpid = lookup_initpid_in_store(fc->pid); | |
891 | if (initpid <= 1 || is_shared_pidns(initpid)) | |
892 | initpid = fc->pid; | |
893 | ||
894 | cg = get_pid_cgroup(initpid, "cpuset"); | |
895 | if (!cg) | |
896 | return read_file_fuse("proc/cpuinfo", buf, size, d); | |
897 | prune_init_slice(cg); | |
898 | ||
899 | cpuset = get_cpuset(cg); | |
900 | if (!cpuset) | |
901 | return 0; | |
902 | ||
903 | if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) | |
904 | use_view = true; | |
905 | else | |
906 | use_view = false; | |
907 | if (use_view) | |
908 | max_cpus = max_cpu_count(cg); | |
909 | ||
910 | f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache); | |
911 | if (!f) | |
912 | return 0; | |
913 | ||
914 | while (getline(&line, &linelen, f) != -1) { | |
915 | ssize_t l; | |
916 | if (firstline) { | |
917 | firstline = false; | |
918 | if (strstr(line, "IBM/S390") != NULL) { | |
919 | is_s390x = true; | |
920 | am_printing = true; | |
921 | continue; | |
922 | } | |
923 | } | |
924 | ||
925 | if (strncmp(line, "# processors:", 12) == 0) | |
926 | continue; | |
927 | ||
928 | if (is_processor_line(line)) { | |
929 | if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus) | |
930 | break; | |
931 | ||
932 | am_printing = cpuline_in_cpuset(line, cpuset); | |
933 | if (am_printing) { | |
934 | curcpu++; | |
935 | l = snprintf(cache, cache_size, "processor : %d\n", curcpu); | |
936 | if (l < 0) | |
937 | return log_error(0, "Failed to write cache"); | |
938 | if ((size_t)l >= cache_size) | |
939 | return log_error(0, "Write to cache was truncated"); | |
940 | cache += l; | |
941 | cache_size -= l; | |
942 | total_len += l; | |
943 | } | |
944 | continue; | |
945 | } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) { | |
946 | char *p; | |
947 | ||
948 | if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus) | |
949 | break; | |
950 | ||
951 | if (!cpu_in_cpuset(cpu, cpuset)) | |
952 | continue; | |
953 | ||
954 | curcpu ++; | |
955 | p = strchr(line, ':'); | |
956 | if (!p || !*p) | |
957 | return 0; | |
958 | p++; | |
959 | ||
960 | l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p); | |
961 | if (l < 0) | |
962 | return log_error(0, "Failed to write cache"); | |
963 | if ((size_t)l >= cache_size) | |
964 | return log_error(0, "Write to cache was truncated"); | |
965 | ||
966 | cache += l; | |
967 | cache_size -= l; | |
968 | total_len += l; | |
969 | continue; | |
970 | ||
971 | } | |
972 | if (am_printing) { | |
973 | l = snprintf(cache, cache_size, "%s", line); | |
974 | if (l < 0) | |
975 | return log_error(0, "Failed to write cache"); | |
976 | if ((size_t)l >= cache_size) | |
977 | return log_error(0, "Write to cache was truncated"); | |
978 | ||
979 | cache += l; | |
980 | cache_size -= l; | |
981 | total_len += l; | |
982 | } | |
983 | } | |
984 | ||
985 | if (is_s390x) { | |
986 | __do_free char *origcache = d->buf; | |
987 | ssize_t l; | |
988 | ||
989 | d->buf = malloc(d->buflen); | |
990 | if (!d->buf) { | |
991 | d->buf = move_ptr(origcache); | |
992 | return 0; | |
993 | } | |
994 | ||
995 | cache = d->buf; | |
996 | cache_size = d->buflen; | |
997 | total_len = 0; | |
998 | l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n"); | |
999 | if (l < 0 || (size_t)l >= cache_size) | |
1000 | return 0; | |
1001 | ||
1002 | cache_size -= l; | |
1003 | cache += l; | |
1004 | total_len += l; | |
1005 | l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1); | |
1006 | if (l < 0 || (size_t)l >= cache_size) | |
1007 | return 0; | |
1008 | ||
1009 | cache_size -= l; | |
1010 | cache += l; | |
1011 | total_len += l; | |
1012 | l = snprintf(cache, cache_size, "%s", origcache); | |
1013 | if (l < 0 || (size_t)l >= cache_size) | |
1014 | return 0; | |
1015 | total_len += l; | |
1016 | } | |
1017 | ||
1018 | d->cached = 1; | |
1019 | d->size = total_len; | |
1020 | if (total_len > size) | |
1021 | total_len = size; | |
1022 | ||
1023 | /* read from off 0 */ | |
1024 | memcpy(buf, d->buf, total_len); | |
1025 | ||
1026 | return total_len; | |
1027 | } | |
1028 | ||
1029 | /* | |
1030 | * Returns 0 on success. | |
1031 | * It is the caller's responsibility to free `return_usage`, unless this | |
1032 | * function returns an error. | |
1033 | */ | |
1034 | int read_cpuacct_usage_all(char *cg, char *cpuset, | |
1035 | struct cpuacct_usage **return_usage, int *size) | |
1036 | { | |
1037 | __do_free char *usage_str = NULL; | |
1038 | __do_free struct cpuacct_usage *cpu_usage = NULL; | |
1039 | int i = 0, j = 0, read_pos = 0, read_cnt = 0; | |
1040 | int cpucount; | |
1041 | int ret; | |
1042 | int cg_cpu; | |
1043 | uint64_t cg_user, cg_system; | |
1044 | int64_t ticks_per_sec; | |
1045 | ||
1046 | ticks_per_sec = sysconf(_SC_CLK_TCK); | |
1047 | if (ticks_per_sec < 0 && errno == EINVAL) { | |
1048 | lxcfs_debug("%m - Failed to determine number of ticks per second"); | |
1049 | return -1; | |
1050 | } | |
1051 | ||
1052 | cpucount = get_nprocs_conf(); | |
1053 | cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount); | |
1054 | if (!cpu_usage) | |
1055 | return -ENOMEM; | |
1056 | ||
1057 | memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount); | |
1058 | if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) { | |
1059 | char *sep = " \t\n"; | |
1060 | char *tok; | |
1061 | ||
1062 | /* Read cpuacct.usage_percpu instead. */ | |
1063 | lxcfs_debug("Falling back to cpuacct.usage_percpu"); | |
1064 | if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) | |
1065 | return -1; | |
1066 | ||
1067 | lxc_iterate_parts(tok, usage_str, sep) { | |
1068 | uint64_t percpu_user; | |
1069 | ||
1070 | if (i >= cpucount) | |
1071 | break; | |
1072 | ||
1073 | tok = trim_whitespace_in_place(tok); | |
1074 | ret = safe_uint64(tok, &percpu_user, 10); | |
1075 | if (ret) | |
1076 | return -1; | |
1077 | ||
1078 | /* Convert the time from nanoseconds to USER_HZ */ | |
1079 | cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec; | |
1080 | cpu_usage[i].system = cpu_usage[i].user; | |
1081 | i++; | |
1082 | lxcfs_debug("cpu%d with time %s", i, tok); | |
1083 | } | |
1084 | } else { | |
1085 | if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) | |
1086 | return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg); | |
1087 | ||
1088 | read_pos += read_cnt; | |
1089 | ||
1090 | for (i = 0, j = 0; i < cpucount; i++) { | |
1091 | ret = sscanf(usage_str + read_pos, | |
1092 | "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu, | |
1093 | &cg_user, &cg_system, &read_cnt); | |
1094 | ||
1095 | if (ret == EOF) | |
1096 | break; | |
1097 | ||
1098 | if (ret != 3) | |
1099 | return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s", | |
1100 | usage_str + read_pos, cg); | |
1101 | ||
1102 | read_pos += read_cnt; | |
1103 | ||
1104 | /* Convert the time from nanoseconds to USER_HZ */ | |
1105 | cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec; | |
1106 | cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec; | |
1107 | j++; | |
1108 | } | |
1109 | } | |
1110 | ||
1111 | *return_usage = move_ptr(cpu_usage); | |
1112 | *size = cpucount; | |
1113 | return 0; | |
1114 | } | |
1115 | ||
1116 | static bool cpuview_init_head(struct cg_proc_stat_head **head) | |
1117 | { | |
1118 | __do_free struct cg_proc_stat_head *h; | |
1119 | ||
1120 | h = zalloc(sizeof(struct cg_proc_stat_head)); | |
1121 | if (!h) | |
1122 | return false; | |
1123 | ||
1124 | if (pthread_rwlock_init(&h->lock, NULL)) | |
1125 | return false; | |
1126 | ||
1127 | h->lastcheck = time(NULL); | |
1128 | ||
1129 | *head = move_ptr(h); | |
1130 | return true; | |
1131 | } | |
1132 | ||
1133 | bool init_cpuview(void) | |
1134 | { | |
1135 | int i; | |
1136 | ||
1137 | for (i = 0; i < CPUVIEW_HASH_SIZE; i++) | |
1138 | proc_stat_history[i] = NULL; | |
1139 | ||
1140 | for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { | |
1141 | if (!cpuview_init_head(&proc_stat_history[i])) | |
1142 | goto err; | |
1143 | } | |
1144 | ||
1145 | return true; | |
1146 | ||
1147 | err: | |
1148 | for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { | |
1149 | if (proc_stat_history[i]) | |
1150 | free_disarm(proc_stat_history[i]); | |
1151 | } | |
1152 | ||
1153 | return false; | |
1154 | } | |
1155 | ||
1156 | static void cpuview_free_head(struct cg_proc_stat_head *head) | |
1157 | { | |
1158 | struct cg_proc_stat *node; | |
1159 | ||
1160 | if (head->next) { | |
1161 | node = head->next; | |
1162 | ||
1163 | for (;;) { | |
1164 | struct cg_proc_stat *cur = node; | |
1165 | node = node->next; | |
1166 | free_proc_stat_node(cur); | |
1167 | if (!node) | |
1168 | break; | |
1169 | } | |
1170 | } | |
1171 | ||
1172 | pthread_rwlock_destroy(&head->lock); | |
1173 | free_disarm(head); | |
1174 | } | |
1175 | ||
1176 | void free_cpuview(void) | |
1177 | { | |
1178 | for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) | |
1179 | if (proc_stat_history[i]) | |
1180 | cpuview_free_head(proc_stat_history[i]); | |
1181 | } |