/* proc_loadavg.c - virtualized /proc/loadavg handling for LXCFS */
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #include "config.h"
8
#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "cgroup_fuse.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "memory_utils.h"
#include "utils.h"
44
45 /*
46 * This parameter is used for proc_loadavg_read().
47 * 1 means use loadavg, 0 means not use.
48 */
49 static int loadavg = 0;
50
/* Hash table parameters. */
52 #define LOAD_SIZE 100 /*the size of hash_table */
53 #define FLUSH_TIME 5 /*the flush rate */
54 #define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point constants for calculating loadavg. */
56 #define FSHIFT 11 /* nr of bits of precision */
57 #define FIXED_1 (1 << FSHIFT) /* 1.0 as fixed-point */
58 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
59 #define EXP_5 2014 /* 1/exp(5sec/5min) */
60 #define EXP_15 2037 /* 1/exp(5sec/15min) */
61 #define LOAD_INT(x) ((x) >> FSHIFT)
62 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
63 static volatile sig_atomic_t loadavg_stop = 0;
64
/* Per-cgroup load-tracking state, chained into one hash bucket. */
struct load_node {
	/* Relative cgroup path this node tracks. */
	char *cg;
	/* 1/5/15-minute load averages in FSHIFT fixed-point. */
	uint64_t avenrun[3];
	/* Number of tasks in state R or D at the last refresh. */
	unsigned int run_pid;
	/* Total number of tasks seen at the last refresh. */
	unsigned int total_pid;
	/* Largest pid seen at the last refresh. */
	unsigned int last_pid;
	/* The file descriptor of the mounted cgroup */
	int cfd;
	/* Intrusive doubly linked bucket chain: successor, and the
	 * pointer (bucket head or previous node's next) that points at us. */
	struct load_node *next;
	struct load_node **pre;
};
78
/* Head of one hash bucket plus the locks protecting it. */
struct load_head {
	/*
	 * The lock serializes inserting a load_node and refreshing a
	 * load_node. For the first load_node of each hash bucket, insert
	 * and refresh within this bucket are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock serializes reading loadavg against deleting a load_node.
	 * Within each hash bucket, read and delete are mutually exclusive,
	 * but concurrent readers are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock serializes reading loadavg against inserting a
	 * load_node. For the first load_node of each hash bucket, read and
	 * insert are mutually exclusive, but concurrent readers are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
100
101 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
102
/*
 * locate_node() looks up the node for cgroup @cg in hash bucket @locate.
 * A non-NULL return means success.
 *
 * NOTE: the bucket's rdlock is deliberately NOT released before returning
 * (on any path), so the node cannot be deleted while the caller is still
 * reading it. The caller — proc_loadavg_read() — unlocks rdlock when done.
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	/* rilock guards the bucket head against a concurrent insert_node(). */
	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		/* Empty bucket: drop rilock, keep rdlock held for the caller. */
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the chain; rdlock alone protects traversal past the head. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
127
/*
 * Insert *n at the head of hash bucket @locate.
 * Takes both the bucket mutex (mutual exclusion with refresh of the first
 * node) and the rilock write lock (mutual exclusion with readers entering
 * the bucket through locate_node()).
 */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next; /* old head, may be NULL */
	load_hash[locate].next = *n;

	/* Wire up the back-pointers of the doubly linked bucket chain. */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
144
145 int calc_hash(const char *name)
146 {
147 unsigned int hash = 0;
148 unsigned int x = 0;
149
150 /* ELFHash algorithm. */
151 while (*name) {
152 hash = (hash << 4) + *name++;
153 x = hash & 0xf0000000;
154 if (x != 0)
155 hash ^= (x >> 24);
156 hash &= ~x;
157 }
158
159 return (hash & 0x7fffffff);
160 }
161
162 int proc_loadavg_read(char *buf, size_t size, off_t offset,
163 struct fuse_file_info *fi)
164 {
165 __do_free char *cg = NULL;
166 struct fuse_context *fc = fuse_get_context();
167 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
168 pid_t initpid;
169 ssize_t total_len = 0;
170 struct load_node *n;
171 int hash;
172 int cfd;
173 uint64_t a, b, c;
174
175 if (offset) {
176 int left;
177
178 if (offset > d->size)
179 return -EINVAL;
180
181 if (!d->cached)
182 return 0;
183
184 left = d->size - offset;
185 total_len = left > size ? size : left;
186 memcpy(buf, d->buf + offset, total_len);
187
188 return total_len;
189 }
190 if (!loadavg)
191 return read_file_fuse("/proc/loadavg", buf, size, d);
192
193 initpid = lookup_initpid_in_store(fc->pid);
194 if (initpid <= 1 || is_shared_pidns(initpid))
195 initpid = fc->pid;
196
197 cg = get_pid_cgroup(initpid, "cpu");
198 if (!cg)
199 return read_file_fuse("/proc/loadavg", buf, size, d);
200
201 prune_init_slice(cg);
202 hash = calc_hash(cg) % LOAD_SIZE;
203 n = locate_node(cg, hash);
204
205 /* First time */
206 if (n == NULL) {
207 cfd = get_cgroup_fd("cpu");
208 if (cfd < 0) {
209 /*
210 * In locate_node() above, pthread_rwlock_unlock() isn't used
211 * because delete is not allowed before read has ended.
212 */
213 pthread_rwlock_unlock(&load_hash[hash].rdlock);
214 return read_file_fuse("/proc/loadavg", buf, size, d);
215 }
216
217 n = must_realloc(NULL, sizeof(struct load_node));
218 n->cg = move_ptr(cg);
219 n->avenrun[0] = 0;
220 n->avenrun[1] = 0;
221 n->avenrun[2] = 0;
222 n->run_pid = 0;
223 n->total_pid = 1;
224 n->last_pid = initpid;
225 n->cfd = cfd;
226 insert_node(&n, hash);
227 }
228 a = n->avenrun[0] + (FIXED_1 / 200);
229 b = n->avenrun[1] + (FIXED_1 / 200);
230 c = n->avenrun[2] + (FIXED_1 / 200);
231 total_len = snprintf(d->buf, d->buflen,
232 "%lu.%02lu "
233 "%lu.%02lu "
234 "%lu.%02lu "
235 "%d/"
236 "%d "
237 "%d\n",
238 LOAD_INT(a),
239 LOAD_FRAC(a),
240 LOAD_INT(b),
241 LOAD_FRAC(b),
242 LOAD_INT(c),
243 LOAD_FRAC(c),
244 n->run_pid,
245 n->total_pid,
246 n->last_pid);
247 pthread_rwlock_unlock(&load_hash[hash].rdlock);
248 if (total_len < 0 || total_len >= d->buflen)
249 return log_error(0, "Failed to write to cache");
250
251 d->size = (int)total_len;
252 d->cached = 1;
253
254 if (total_len > size)
255 total_len = size;
256
257 memcpy(buf, d->buf, total_len);
258 return total_len;
259 }
260
/*
 * Collect the pids living below a cgroup directory.
 * eg: from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs find each pid.
 * @pid_buf  : out-array; each pid is appended as a strdup()ed getline() line
 *             (still newline-terminated — the caller strips that).
 * @rel_path : path of the cgroup relative to @cfd.
 *             eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth    : how many directory levels below @rel_path to recurse into.
 * @sum      : number of pids collected so far; the updated count is returned.
 * @cfd      : file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Errors are swallowed by design: on any failure the pids gathered so far
 * are kept and the current @sum is returned.
 */
static int calc_pid(char ***pid_buf, const char *rel_path, int depth, int sum, int cfd)
{
	__do_free char *line = NULL, *path = NULL;
	__do_free void *fdopen_cache = NULL;
	__do_close int fd = -EBADF;
	__do_fclose FILE *f = NULL;
	__do_closedir DIR *dir = NULL;
	struct dirent *file;
	size_t linelen = 0;
	int pd;

	fd = openat(cfd, rel_path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return sum;

	dir = fdopendir(fd);
	if (!dir)
		return sum;
	/* Transfer ownership to fdopendir(). */
	move_fd(fd);

	/* Recurse into child cgroups first; depth shrinks by one per level. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		if (strcmp(file->d_name, ".") == 0)
			continue;

		if (strcmp(file->d_name, "..") == 0)
			continue;

		if (file->d_type == DT_DIR) {
			__do_free char *path_next = NULL;
			path_next = must_make_path(rel_path, "/", file->d_name, NULL);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_next, pd, sum, cfd);
		}
	}

	/* Now read this cgroup's own member pids from cgroup.procs. */
	path = must_make_path(rel_path, "/cgroup.procs", NULL);
	fd = openat(cfd, path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return sum;

	/* NOTE(review): fdopen_cached() appears to share/cache the fd, with
	 * cleanup split between __do_close and __do_fclose — confirm against
	 * memory_utils.h before restructuring this. */
	f = fdopen_cached(fd, "re", &fdopen_cache);
	if (!f)
		return sum;

	while (getline(&line, &linelen, f) != -1) {
		__do_free char *task_pid = NULL;
		char **pid;

		task_pid = strdup(line);
		if (!task_pid)
			return sum;

		/* Grow the pid array by one slot; on failure keep what we have. */
		pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		if (!pid)
			return sum;
		*pid_buf = pid;
		*(*pid_buf + sum) = move_ptr(task_pid);
		sum++;
	}

	return sum;
}
333
334 /*
335 * calc_load calculates the load according to the following formula:
336 * load1 = load0 * exp + active * (1 - exp)
337 *
338 * @load1: the new loadavg.
339 * @load0: the former loadavg.
340 * @active: the total number of running pid at this moment.
341 * @exp: the fixed-point defined in the beginning.
342 */
343 static uint64_t calc_load(uint64_t load, uint64_t exp, uint64_t active)
344 {
345 uint64_t newload;
346
347 active = active > 0 ? active * FIXED_1 : 0;
348 newload = load * exp + active * (FIXED_1 - exp);
349 if (active >= load)
350 newload += FIXED_1 - 1;
351
352 return newload / FIXED_1;
353 }
354
355 /*
356 * Return 0 means that container p->cg is closed.
357 * Return -1 means that error occurred in refresh.
358 * Positive num equals the total number of pid.
359 */
360 static int refresh_load(struct load_node *p, const char *path)
361 {
362 char **idbuf = NULL;
363 char proc_path[STRLITERALLEN("/proc//task//status") +
364 2 * INTTYPE_TO_STRLEN(pid_t) + 1];
365 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
366 size_t linelen = 0;
367 int sum, length;
368 struct dirent *file;
369
370 idbuf = must_realloc(NULL, sizeof(char **));
371
372 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
373 if (!sum)
374 goto out;
375
376 for (i = 0; i < sum; i++) {
377 __do_closedir DIR *dp = NULL;
378
379 length = strlen(idbuf[i]) - 1;
380 idbuf[i][length] = '\0';
381
382 ret = snprintf(proc_path, sizeof(proc_path), "/proc/%s/task", idbuf[i]);
383 if (ret < 0 || (size_t)ret > sizeof(proc_path)) {
384 i = sum;
385 sum = -1;
386 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
387 goto err_out;
388 }
389
390 dp = opendir(proc_path);
391 if (!dp) {
392 lxcfs_error("Failed to open \"%s\"", proc_path);
393 continue;
394 }
395
396 while ((file = readdir(dp)) != NULL) {
397 __do_free char *line = NULL;
398 __do_fclose FILE *f = NULL;
399
400 if (strcmp(file->d_name, ".") == 0)
401 continue;
402
403 if (strcmp(file->d_name, "..") == 0)
404 continue;
405
406 total_pid++;
407
408 /* We make the biggest pid become last_pid. */
409 ret = atof(file->d_name);
410 last_pid = (ret > last_pid) ? ret : last_pid;
411
412 ret = snprintf(proc_path, sizeof(proc_path),
413 "/proc/%s/task/%s/status", idbuf[i], file->d_name);
414 if (ret < 0 || (size_t)ret > sizeof(proc_path)) {
415 i = sum;
416 sum = -1;
417 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
418 goto err_out;
419 }
420
421 f = fopen(proc_path, "re");
422 if (!f)
423 continue;
424
425 while (getline(&line, &linelen, f) != -1)
426 if ((line[0] == 'S') && (line[1] == 't'))
427 break;
428
429 if ((line[7] == 'R') || (line[7] == 'D'))
430 run_pid++;
431 }
432 }
433
434 /* Calculate the loadavg. */
435 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
436 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
437 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
438 p->run_pid = run_pid;
439 p->total_pid = total_pid;
440 p->last_pid = last_pid;
441
442 err_out:
443 for (; i > 0; i--)
444 free(idbuf[i - 1]);
445 out:
446 free(idbuf);
447 return sum;
448 }
449
/* Delete the load_node n and return the next node of it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	/* Write-lock rdlock: deletion excludes concurrent readers of this bucket. */
	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		/* n is the tail: the slot pointing at n now points at nothing. */
		*(n->pre) = NULL;
	} else {
		/* Splice n out and fix the successor's back-pointer. */
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free_disarm(n->cg);
	free_disarm(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
468
469 /*
470 * Traverse the hash table and update it.
471 */
472 static void *load_begin(void *arg)
473 {
474
475 int first_node, sum;
476 struct load_node *f;
477 clock_t time1, time2;
478
479 for (;;) {
480 if (loadavg_stop == 1)
481 return NULL;
482
483 time1 = clock();
484 for (int i = 0; i < LOAD_SIZE; i++) {
485 pthread_mutex_lock(&load_hash[i].lock);
486 if (load_hash[i].next == NULL) {
487 pthread_mutex_unlock(&load_hash[i].lock);
488 continue;
489 }
490
491 f = load_hash[i].next;
492 first_node = 1;
493 while (f) {
494 __do_free char *path = NULL;
495
496 path = must_make_path_relative(f->cg, NULL);
497
498 sum = refresh_load(f, path);
499 if (sum == 0)
500 f = del_node(f, i);
501 else
502 f = f->next;
503
504 /* load_hash[i].lock locks only on the first node.*/
505 if (first_node == 1) {
506 first_node = 0;
507 pthread_mutex_unlock(&load_hash[i].lock);
508 }
509 }
510 }
511
512 if (loadavg_stop == 1)
513 return NULL;
514
515 time2 = clock();
516 usleep(FLUSH_TIME * 1000000 -
517 (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
518 }
519 }
520
521 /*
522 * init_load initialize the hash table.
523 * Return 0 on success, return -1 on failure.
524 */
525 static int init_load(void)
526 {
527 int i;
528 int ret;
529
530 for (i = 0; i < LOAD_SIZE; i++) {
531 load_hash[i].next = NULL;
532 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
533 if (ret) {
534 lxcfs_error("Failed to initialize lock");
535 goto out3;
536 }
537
538 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
539 if (ret) {
540 lxcfs_error("Failed to initialize rdlock");
541 goto out2;
542 }
543
544 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
545 if (ret) {
546 lxcfs_error("Failed to initialize rilock");
547 goto out1;
548 }
549 }
550
551 return 0;
552
553 out1:
554 pthread_rwlock_destroy(&load_hash[i].rdlock);
555 out2:
556 pthread_mutex_destroy(&load_hash[i].lock);
557 out3:
558 while (i > 0) {
559 i--;
560 pthread_mutex_destroy(&load_hash[i].lock);
561 pthread_rwlock_destroy(&load_hash[i].rdlock);
562 pthread_rwlock_destroy(&load_hash[i].rilock);
563 }
564
565 return -1;
566 }
567
568 static void load_free(void)
569 {
570 struct load_node *f, *p;
571
572 for (int i = 0; i < LOAD_SIZE; i++) {
573 pthread_mutex_lock(&load_hash[i].lock);
574 pthread_rwlock_wrlock(&load_hash[i].rilock);
575 pthread_rwlock_wrlock(&load_hash[i].rdlock);
576 if (load_hash[i].next == NULL) {
577 pthread_mutex_unlock(&load_hash[i].lock);
578 pthread_mutex_destroy(&load_hash[i].lock);
579 pthread_rwlock_unlock(&load_hash[i].rilock);
580 pthread_rwlock_destroy(&load_hash[i].rilock);
581 pthread_rwlock_unlock(&load_hash[i].rdlock);
582 pthread_rwlock_destroy(&load_hash[i].rdlock);
583 continue;
584 }
585
586 for (f = load_hash[i].next; f;) {
587 free_disarm(f->cg);
588 p = f->next;
589 free_disarm(f);
590 f = p;
591 }
592
593 pthread_mutex_unlock(&load_hash[i].lock);
594 pthread_mutex_destroy(&load_hash[i].lock);
595 pthread_rwlock_unlock(&load_hash[i].rilock);
596 pthread_rwlock_destroy(&load_hash[i].rilock);
597 pthread_rwlock_unlock(&load_hash[i].rdlock);
598 pthread_rwlock_destroy(&load_hash[i].rdlock);
599 }
600 }
601
602 /* Return a positive number on success, return 0 on failure.*/
603 pthread_t load_daemon(int load_use)
604 {
605 int ret;
606 pthread_t pid;
607
608 ret = init_load();
609 if (ret == -1)
610 return log_error(0, "Initialize hash_table fails in load_daemon!");
611
612 ret = pthread_create(&pid, NULL, load_begin, NULL);
613 if (ret != 0) {
614 load_free();
615 return log_error(0, "Create pthread fails in load_daemon!");
616 }
617
618 /* use loadavg, here loadavg = 1*/
619 loadavg = load_use;
620 return pid;
621 }
622
623 /* Returns 0 on success. */
624 int stop_load_daemon(pthread_t pid)
625 {
626 int s;
627
628 /* Signal the thread to gracefully stop */
629 loadavg_stop = 1;
630
631 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
632 if (s)
633 return log_error(-1, "stop_load_daemon error: failed to join");
634
635 load_free();
636 loadavg_stop = 0;
637
638 return 0;
639 }