/* SPDX-License-Identifier: LGPL-2.1-or-later */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#ifndef FUSE_USE_VERSION
#define FUSE_USE_VERSION 26
#endif

#define _FILE_OFFSET_BITS 64

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "proc_loadavg.h"
#include "utils.h"

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
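
/*
 * Layout note: the per-cgroup history lives in a fixed-size hash table.
 * calc_hash(cg) % CPUVIEW_HASH_SIZE selects a bucket and each bucket is a
 * singly linked list of cg_proc_stat nodes. Lookups take the bucket rwlock
 * shared, insertion and pruning take it exclusive, and every node carries
 * its own mutex protecting the usage/view counter arrays.
 */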

static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
{
	int i;

	lxcfs_debug("Resetting stat node for %s\n", node->cg);
	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	for (i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	node->cpu_count = cpu_count;
}

static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
{
	__do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;

	/* Allocate new memory */
	new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_usage)
		return false;

	new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!new_view)
		return false;

	/* Copy existing data & initialize new elements */
	for (int i = 0; i < cpu_count; i++) {
		if (i < node->cpu_count) {
			new_usage[i].user = node->usage[i].user;
			new_usage[i].system = node->usage[i].system;
			new_usage[i].idle = node->usage[i].idle;

			new_view[i].user = node->view[i].user;
			new_view[i].system = node->view[i].system;
			new_view[i].idle = node->view[i].idle;
		} else {
			new_usage[i].user = 0;
			new_usage[i].system = 0;
			new_usage[i].idle = 0;

			new_view[i].user = 0;
			new_view[i].system = 0;
			new_view[i].idle = 0;
		}
	}

	free(node->usage);
	node->usage = move_ptr(new_usage);

	free(node->view);
	node->view = move_ptr(new_view);
	node->cpu_count = cpu_count;

	return true;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free_disarm(node->cg);
	free_disarm(node->usage);
	free_disarm(node->view);
	free_disarm(node);
}

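/*
 * Insert a freshly allocated node into its hash bucket. If another thread
 * already added a node for the same cgroup, the new node is freed and the
 * existing one is returned instead, so callers must keep using the return
 * value rather than the node they passed in.
 */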
static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
{
	int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node, *rv = new_node;

	pthread_rwlock_wrlock(&head->lock);

	if (!head->next) {
		head->next = new_node;
		goto out;
	}

	node = head->next;

	for (;;) {
		if (strcmp(node->cg, new_node->cg) == 0) {
			/* The node is already present, return it */
			free_proc_stat_node(new_node);
			rv = node;
			goto out;
		}

		if (node->next) {
			node = node->next;
			continue;
		}

		node->next = new_node;
		goto out;
	}

out:
	pthread_rwlock_unlock(&head->lock);
	return rv;
}

static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	struct cg_proc_stat *node;
	int i;

	node = malloc(sizeof(struct cg_proc_stat));
	if (!node)
		goto err;

	node->cg = NULL;
	node->usage = NULL;
	node->view = NULL;

	node->cg = malloc(strlen(cg) + 1);
	if (!node->cg)
		goto err;

	strcpy(node->cg, cg);

	node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->usage)
		goto err;

	memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);

	node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
	if (!node->view)
		goto err;

	node->cpu_count = cpu_count;
	node->next = NULL;

	if (pthread_mutex_init(&node->lock, NULL) != 0) {
		lxcfs_error("%s\n", "Failed to initialize node lock");
		goto err;
	}

	for (i = 0; i < cpu_count; i++) {
		node->view[i].user = 0;
		node->view[i].system = 0;
		node->view[i].idle = 0;
	}

	return node;

err:
	if (node && node->cg)
		free(node->cg);
	if (node && node->usage)
		free(node->usage);
	if (node && node->view)
		free(node->view);
	if (node)
		free(node);

	return NULL;
}

static bool cgfs_param_exist(const char *controller, const char *cgroup,
			     const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return (faccessat(cfd, fnam, F_OK, 0) == 0);
}

static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL, *prev, *tmp;

	for (prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			tmp = node;
			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}

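/*
 * Walk the hash buckets and drop nodes whose cgroup no longer exposes
 * cpu.shares, i.e. whose cgroup has gone away. Each bucket is checked at
 * most once per PROC_STAT_PRUNE_INTERVAL seconds, and the scan stops as
 * soon as it reaches a bucket that was checked recently.
 */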
#define PROC_STAT_PRUNE_INTERVAL 10
static void prune_proc_stat_history(void)
{
	int i;
	time_t now = time(NULL);

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}

static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head,
						const char *cg)
{
	struct cg_proc_stat *node;

	pthread_rwlock_rdlock(&head->lock);

	if (!head->next) {
		pthread_rwlock_unlock(&head->lock);
		return NULL;
	}

	node = head->next;

	do {
		if (strcmp(cg, node->cg) == 0)
			goto out;
	} while ((node = node->next));

	node = NULL;

out:
	pthread_rwlock_unlock(&head->lock);
	prune_proc_stat_history();
	return node;
}

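/*
 * Look up (or create) the stat node for a cgroup and return it with its
 * mutex held; the caller is responsible for unlocking it. If the host has
 * gained CPUs since the node was created, the usage/view arrays are grown
 * first. Returns NULL on allocation failure or if the arrays cannot be
 * expanded.
 */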
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
			    node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
				    node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}

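/*
 * Move surplus CPU time from hidden CPUs onto a visible one. The visible
 * CPU can absorb at most (threshold - user - system) extra time, further
 * capped by its idle time; that amount is taken out of *surplus and added
 * to *counter (the CPU's user or system field) while idle shrinks by the
 * same amount. Example: with threshold = 1000, user = 300, system = 200,
 * idle = 600 and a surplus of 700, the CPU absorbs 500 units and 200 units
 * of surplus remain.
 */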
static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage,
			  unsigned long *counter, unsigned long threshold)
{
	unsigned long free_space, to_add;

	free_space = threshold - usage->user - usage->system;

	if (free_space > usage->idle)
		free_space = usage->idle;

	to_add = free_space > *surplus ? *surplus : free_space;

	*counter += to_add;
	usage->idle -= to_add;
	*surplus -= to_add;
}

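/*
 * Compute per-CPU deltas between two snapshots into `diff` and return the
 * summed delta across all online CPUs. Counters that appear to have gone
 * backwards (e.g. after a cpuset change reordered CPUs) are clamped to 0
 * instead of underflowing.
 */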
static unsigned long diff_cpu_usage(struct cpuacct_usage *older,
				    struct cpuacct_usage *newer,
				    struct cpuacct_usage *diff, int cpu_count)
{
	int i;
	unsigned long sum = 0;

	for (i = 0; i < cpu_count; i++) {
		if (!newer[i].online)
			continue;

		/* When cpuset is changed on the fly, the CPUs might get reordered.
		 * We could either reset all counters, or check that the subtractions
		 * below will return expected results.
		 */
		if (newer[i].user > older[i].user)
			diff[i].user = newer[i].user - older[i].user;
		else
			diff[i].user = 0;

		if (newer[i].system > older[i].system)
			diff[i].system = newer[i].system - older[i].system;
		else
			diff[i].system = 0;

		if (newer[i].idle > older[i].idle)
			diff[i].idle = newer[i].idle - older[i].idle;
		else
			diff[i].idle = 0;

		sum += diff[i].user;
		sum += diff[i].system;
		sum += diff[i].idle;
	}

	return sum;
}

/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
 * depending on `param`. The parameter value is returned through `value`.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	__do_free char *str = NULL;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */

	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
		return false;

	if (sscanf(str, "%ld", value) != 1)
		return false;

	return true;
}
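
/*
 * For example, read_cpu_cfs_param(cg, "quota", &quota) reads the cgroup's
 * `cpu.cfs_quota_us` file and read_cpu_cfs_param(cg, "period", &period)
 * reads `cpu.cfs_period_us`.
 */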

/*
 * Return the exact number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	double rv;
	int nprocs;
	int64_t cfs_quota, cfs_period;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	if (cfs_quota <= 0 || cfs_period <= 0)
		return 0;

	rv = (double)cfs_quota / (double)cfs_period;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	return rv;
}
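
/*
 * Example: with cpu.cfs_quota_us = 150000 and cpu.cfs_period_us = 100000,
 * exact_cpu_count() yields 1.5, capped at the number of online host CPUs
 * reported by get_nprocs().
 */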

/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;
	char *cpuset = NULL;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	/* use min value in cpu quota and cpuset */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}
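
/*
 * Example: with cpu.cfs_quota_us = 150000 and cpu.cfs_period_us = 100000,
 * max_cpu_count() rounds the 0.5 remainder up and reports 2 CPUs; the
 * result is then clamped by get_nprocs() and, when a cpuset is configured,
 * by the number of CPUs in that cpuset.
 */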

int cpuview_proc_stat(const char *cg, const char *cpuset,
		      struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size,
		      FILE *f, char *buf, size_t buf_size)
{
	__do_free char *line = NULL;
	__do_free struct cpuacct_usage *diff = NULL;
	size_t linelen = 0, total_len = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
		      irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	int nprocs = get_nprocs_conf();

	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;

		/* not a ^cpuN line containing a number N */
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
			break;

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu++;
		cpu_cnt++;

		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++)
				cg_cpu_usage[i].online = false;
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			     &user,
			     &nice,
			     &system,
			     &idle,
			     &iowait,
			     &irq,
			     &softirq,
			     &steal,
			     &guest,
			     &guest_nice);

		if (ret != 10)
			continue;

		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
				    "%lu in cpuacct.usage_all; unable to determine idle time\n",
				    curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than are available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);

	if (!stat_node) {
		lxcfs_error("unable to find/create stat node for %s\n", cg);
		return 0;
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		return 0;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

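	/*
	 * Below, time accrued on CPUs beyond the first max_cpus visible ones
	 * (collected above as user_surplus/system_surplus) is redistributed
	 * onto the visible CPUs: each visible CPU may absorb surplus up to
	 * `threshold`, the average per-CPU load scaled to max_cpus, so the
	 * container sees its full usage squeezed into the CPUs it is allowed
	 * to use.
	 */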
	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		unsigned long diff_user = 0;
		unsigned long diff_system = 0;
		unsigned long diff_idle = 0;
		unsigned long max_diff_idle = 0;
		unsigned long max_diff_idle_index = 0;
		double exact_cpus;

		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(&user_surplus, &diff[curcpu],
				      &diff[curcpu].user, threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(&system_surplus, &diff[curcpu],
				      &diff[curcpu].system, threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (i == max_cpus)
				break;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;

			diff_user += diff[curcpu].user;
			diff_system += diff[curcpu].system;
			diff_idle += diff[curcpu].idle;
			if (diff[curcpu].idle > max_diff_idle) {
				max_diff_idle = diff[curcpu].idle;
				max_diff_idle_index = curcpu;
			}

			lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
		}
		lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);

		/* revise cpu usage view to support partial cpu case. */
		exact_cpus = exact_cpu_count(cg);
		if (exact_cpus < (double)max_cpus) {
			unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));

			lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
			lxcfs_v("delta: %lu\n", delta);
			lxcfs_v("idle_sum before: %lu\n", idle_sum);
			idle_sum = idle_sum > delta ? idle_sum - delta : 0;
			lxcfs_v("idle_sum after: %lu\n", idle_sum);

			curcpu = max_diff_idle_index;
			lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
			stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
			lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
		}
	} else {
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
		     user_sum,
		     system_sum,
		     idle_sum);
	lxcfs_v("cpu-all: %s\n", buf);

	if (l < 0) {
		perror("Error writing to cache");
		return 0;
	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		return 0;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
			     i,
			     stat_node->view[curcpu].user,
			     stat_node->view[curcpu].system,
			     stat_node->view[curcpu].idle);
		lxcfs_v("cpu: %s\n", buf);

		if (l < 0) {
			perror("Error writing to cache");
			return 0;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		return 0;
	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		return 0;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);
	return total_len;
}

/*
 * check whether this is a "^processor" line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return true;
	return false;
}

static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) != 1)
		return false;
	return cpu_in_cpuset(cpu, cpuset);
}

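/*
 * Read handler for the virtualized /proc/cpuinfo: copy the host's
 * /proc/cpuinfo but keep only processors that are in the container's
 * cpuset, renumbering them from 0, and stop once the quota-derived
 * maximum CPU count has been emitted. s390x needs special casing because
 * its cpuinfo lists processors as "processor N:" entries.
 */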
int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		      struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file_fuse("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					return 0;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					return 0;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			char *p;
			if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu++;
			p = strchr(line, ':');
			if (!p || !*p)
				return 0;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		__do_free char *origcache = d->buf;
		ssize_t l;

		d->buf = malloc(d->buflen);
		if (!d->buf) {
			d->buf = move_ptr(origcache);
			return 0;
		}

		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size)
			return 0;

		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		if (l < 0 || l >= cache_size)
			return 0;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	return total_len;
}

/*
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 */
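/*
 * Note on units: cpuacct.usage_all (and cpuacct.usage_percpu) report time
 * in nanoseconds, while /proc/stat uses clock ticks (USER_HZ). The values
 * are therefore converted below via value / 1e9 * sysconf(_SC_CLK_TCK);
 * e.g. 2500000000 ns at 100 ticks per second becomes 250 ticks.
 */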
int read_cpuacct_usage_all(char *cg, char *cpuset,
			   struct cpuacct_usage **return_usage, int *size)
{
	__do_free char *usage_str = NULL;
	__do_free struct cpuacct_usage *cpu_usage = NULL;
	int cpucount = get_nprocs_conf();
	int i = 0, j = 0, read_pos = 0, read_cnt = 0;
	int ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;

	ticks_per_sec = sysconf(_SC_CLK_TCK);

	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_v(
			"%s\n",
			"read_cpuacct_usage_all failed to determine number of clock ticks "
			"in a second");
		return -1;
	}

	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		char *data = NULL;
		size_t sz = 0, asz = 0;

		/* read cpuacct.usage_percpu instead. */
		lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
		if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
			return -1;
		lxcfs_v("usage_str: %s\n", usage_str);

		/* convert cpuacct.usage_percpu into cpuacct.usage_all. */
		lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");

		must_strcat(&data, &sz, &asz, "cpu user system\n");

		while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
			lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
			must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
			i++;
			read_pos += read_cnt;
		}

		usage_str = data;

		lxcfs_v("usage_str: %s\n", usage_str);
	}

	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
		lxcfs_error("read_cpuacct_usage_all reading first line from "
			    "%s/cpuacct.usage_all failed.\n", cg);
		return -1;
	}

	read_pos += read_cnt;

	for (i = 0, j = 0; i < cpucount; i++) {
		ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
			     &cg_system, &read_cnt);

		if (ret == EOF)
			break;

		if (ret != 3) {
			lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
				    "failed.\n", cg);
			return -1;
		}

		read_pos += read_cnt;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	*return_usage = move_ptr(cpu_usage);
	*size = cpucount;
	return 0;
}

static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head)) {
		lxcfs_error("%s\n", strerror(errno));
		return false;
	}

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		lxcfs_error("%s\n", "Failed to initialize list lock");
		free_disarm(*head);
		return false;
	}

	return true;
}

bool init_cpuview(void)
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			free_disarm(proc_stat_history[i]);
	}

	return false;
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free_disarm(head);
}

void free_cpuview(void)
{
	for (int i = 0; i < CPUVIEW_HASH_SIZE; i++)
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
}