]> git.proxmox.com Git - mirror_lxcfs.git/blob - src/proc_loadavg.c
tree-wide: align lxcfs and lxc licensing
[mirror_lxcfs.git] / src / proc_loadavg.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE
5 #endif
6
7 #ifndef FUSE_USE_VERSION
8 #define FUSE_USE_VERSION 26
9 #endif
10
11 #define _FILE_OFFSET_BITS 64
12
13 #define __STDC_FORMAT_MACROS
14 #include <dirent.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <fuse.h>
18 #include <inttypes.h>
19 #include <libgen.h>
20 #include <pthread.h>
21 #include <sched.h>
22 #include <stdarg.h>
23 #include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <wait.h>
31 #include <linux/magic.h>
32 #include <linux/sched.h>
33 #include <sys/epoll.h>
34 #include <sys/mman.h>
35 #include <sys/mount.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <sys/syscall.h>
39 #include <sys/sysinfo.h>
40 #include <sys/vfs.h>
41
42 #include "bindings.h"
43 #include "config.h"
44 #include "cgroup_fuse.h"
45 #include "cgroups/cgroup.h"
46 #include "cgroups/cgroup_utils.h"
47 #include "memory_utils.h"
48 #include "utils.h"
49
/*
 * Controls proc_loadavg_read(): 1 means serve the per-container
 * loadavg computed by the load daemon, 0 means pass through the
 * host's /proc/loadavg.
 */
54 static int loadavg = 0;
55
/* Hash table parameters. */
57 #define LOAD_SIZE 100 /*the size of hash_table */
58 #define FLUSH_TIME 5 /*the flush rate */
59 #define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point parameters for calculating the loadavg. */
61 #define FSHIFT 11 /* nr of bits of precision */
62 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
63 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
64 #define EXP_5 2014 /* 1/exp(5sec/5min) */
65 #define EXP_15 2037 /* 1/exp(5sec/15min) */
66 #define LOAD_INT(x) ((x) >> FSHIFT)
67 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
68 static volatile sig_atomic_t loadavg_stop = 0;
69
70 struct load_node {
71 char *cg; /*cg */
72 unsigned long avenrun[3]; /* Load averages */
73 unsigned int run_pid;
74 unsigned int total_pid;
75 unsigned int last_pid;
76 int cfd; /* The file descriptor of the mounted cgroup */
77 struct load_node *next;
78 struct load_node **pre;
79 };
80
81 struct load_head {
82 /*
83 * The lock is about insert load_node and refresh load_node.To the first
84 * load_node of each hash bucket, insert and refresh in this hash bucket is
85 * mutually exclusive.
86 */
87 pthread_mutex_t lock;
88 /*
89 * The rdlock is about read loadavg and delete load_node.To each hash
90 * bucket, read and delete is mutually exclusive. But at the same time, we
91 * allow paratactic read operation. This rdlock is at list level.
92 */
93 pthread_rwlock_t rdlock;
94 /*
95 * The rilock is about read loadavg and insert load_node.To the first
96 * load_node of each hash bucket, read and insert is mutually exclusive.
97 * But at the same time, we allow paratactic read operation.
98 */
99 pthread_rwlock_t rilock;
100 struct load_node *next;
101 };
102
103 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
104
105 /*
106 * locate_node() finds special node. Not return NULL means success.
107 * It should be noted that rdlock isn't unlocked at the end of code
108 * because this function is used to read special node. Delete is not
109 * allowed before read has ended.
110 * unlock rdlock only in proc_loadavg_read().
111 */
112 static struct load_node *locate_node(char *cg, int locate)
113 {
114 struct load_node *f = NULL;
115 int i = 0;
116
117 pthread_rwlock_rdlock(&load_hash[locate].rilock);
118 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
119 if (load_hash[locate].next == NULL) {
120 pthread_rwlock_unlock(&load_hash[locate].rilock);
121 return f;
122 }
123 f = load_hash[locate].next;
124 pthread_rwlock_unlock(&load_hash[locate].rilock);
125 while (f && ((i = strcmp(f->cg, cg)) != 0))
126 f = f->next;
127 return f;
128 }
129
130 static void insert_node(struct load_node **n, int locate)
131 {
132 struct load_node *f;
133
134 pthread_mutex_lock(&load_hash[locate].lock);
135 pthread_rwlock_wrlock(&load_hash[locate].rilock);
136 f = load_hash[locate].next;
137 load_hash[locate].next = *n;
138
139 (*n)->pre = &(load_hash[locate].next);
140 if (f)
141 f->pre = &((*n)->next);
142 (*n)->next = f;
143 pthread_mutex_unlock(&load_hash[locate].lock);
144 pthread_rwlock_unlock(&load_hash[locate].rilock);
145 }
146
/*
 * ELF hash of @name, clamped to a non-negative int so it can be
 * reduced modulo the table size by the caller.
 */
int calc_hash(const char *name)
{
	unsigned int hash = 0;

	for (; *name; name++) {
		unsigned int high;

		hash = (hash << 4) + *name;
		high = hash & 0xf0000000;
		if (high)
			hash ^= high >> 24;
		hash &= ~high;
	}

	return hash & 0x7fffffff;
}
163
164 int proc_loadavg_read(char *buf, size_t size, off_t offset,
165 struct fuse_file_info *fi)
166 {
167 __do_free char *cg = NULL;
168 struct fuse_context *fc = fuse_get_context();
169 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
170 pid_t initpid;
171 size_t total_len = 0;
172 char *cache = d->buf;
173 struct load_node *n;
174 int hash;
175 int cfd;
176 unsigned long a, b, c;
177
178 if (offset) {
179 int left;
180
181 if (offset > d->size)
182 return -EINVAL;
183
184 if (!d->cached)
185 return 0;
186
187 left = d->size - offset;
188 total_len = left > size ? size : left;
189 memcpy(buf, cache + offset, total_len);
190
191 return total_len;
192 }
193 if (!loadavg)
194 return read_file_fuse("/proc/loadavg", buf, size, d);
195
196 initpid = lookup_initpid_in_store(fc->pid);
197 if (initpid <= 1 || is_shared_pidns(initpid))
198 initpid = fc->pid;
199
200 cg = get_pid_cgroup(initpid, "cpu");
201 if (!cg)
202 return read_file_fuse("/proc/loadavg", buf, size, d);
203
204 prune_init_slice(cg);
205 hash = calc_hash(cg) % LOAD_SIZE;
206 n = locate_node(cg, hash);
207
208 /* First time */
209 if (n == NULL) {
210 cfd = get_cgroup_fd("cpu");
211 if (cfd >= 0) {
212 /*
213 * In locate_node() above, pthread_rwlock_unlock() isn't used
214 * because delete is not allowed before read has ended.
215 */
216 pthread_rwlock_unlock(&load_hash[hash].rdlock);
217 return 0;
218 }
219
220 do {
221 n = malloc(sizeof(struct load_node));
222 } while (!n);
223
224 do {
225 n->cg = malloc(strlen(cg)+1);
226 } while (!n->cg);
227
228 strcpy(n->cg, cg);
229 n->avenrun[0] = 0;
230 n->avenrun[1] = 0;
231 n->avenrun[2] = 0;
232 n->run_pid = 0;
233 n->total_pid = 1;
234 n->last_pid = initpid;
235 n->cfd = cfd;
236 insert_node(&n, hash);
237 }
238 a = n->avenrun[0] + (FIXED_1/200);
239 b = n->avenrun[1] + (FIXED_1/200);
240 c = n->avenrun[2] + (FIXED_1/200);
241 total_len = snprintf(d->buf, d->buflen,
242 "%lu.%02lu "
243 "%lu.%02lu "
244 "%lu.%02lu "
245 "%d/"
246 "%d"
247 "%d\n",
248 LOAD_INT(a),
249 LOAD_FRAC(a),
250 LOAD_INT(b),
251 LOAD_FRAC(b),
252 LOAD_INT(c),
253 LOAD_FRAC(c),
254 n->run_pid,
255 n->total_pid,
256 n->last_pid);
257 pthread_rwlock_unlock(&load_hash[hash].rdlock);
258 if (total_len < 0 || total_len >= d->buflen)
259 return log_error(0, "Failed to write to cache");
260
261 d->size = (int)total_len;
262 d->cached = 1;
263
264 if (total_len > size)
265 total_len = size;
266
267 memcpy(buf, d->buf, total_len);
268 return total_len;
269 }
270
271 /*
272 * Find the process pid from cgroup path.
273 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
274 * @pid_buf : put pid to pid_buf.
275 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
276 * @depth : the depth of cgroup in container.
277 * @sum : return the number of pid.
278 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
279 */
280 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
281 {
282 __do_free char *path = NULL;
283 __do_free void *fdopen_cache = NULL;
284 __do_close_prot_errno int fd = -EBADF;
285 __do_fclose FILE *f = NULL;
286 __do_closedir DIR *dir = NULL;
287 struct dirent *file;
288 size_t linelen = 0;
289 char *line = NULL;
290 int pd;
291 char **pid;
292
293 /* path = dpath + "/cgroup.procs" + /0 */
294 path = malloc(strlen(dpath) + 20);
295 if (!path)
296 return sum;
297
298 strcpy(path, dpath);
299 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
300 if (fd < 0)
301 return sum;
302
303 dir = fdopendir(move_fd(fd));
304 if (!dir)
305 return sum;
306
307 while (((file = readdir(dir)) != NULL) && depth > 0) {
308 if (strcmp(file->d_name, ".") == 0)
309 continue;
310
311 if (strcmp(file->d_name, "..") == 0)
312 continue;
313
314 if (file->d_type == DT_DIR) {
315 __do_free char *path_dir = NULL;
316
317 /* path + '/' + d_name +/0 */
318 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
319 if (!path_dir)
320 return sum;
321
322 strcpy(path_dir, path);
323 strcat(path_dir, "/");
324 strcat(path_dir, file->d_name);
325 pd = depth - 1;
326 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
327 }
328 }
329
330 strcat(path, "/cgroup.procs");
331 fd = openat(cfd, path, O_RDONLY);
332 if (fd < 0)
333 return sum;
334
335 f = fdopen_cached(fd, "re", &fdopen_cache);
336 if (!f)
337 return sum;
338
339 while (getline(&line, &linelen, f) != -1) {
340 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
341 if (!pid)
342 return sum;
343 *pid_buf = pid;
344
345 *(*pid_buf + sum) = malloc(strlen(line) + 1);
346 if (!*(*pid_buf + sum))
347 return sum;
348
349 strcpy(*(*pid_buf + sum), line);
350 sum++;
351 }
352
353 return sum;
354 }
355
356 /*
357 * calc_load calculates the load according to the following formula:
358 * load1 = load0 * exp + active * (1 - exp)
359 *
360 * @load1: the new loadavg.
361 * @load0: the former loadavg.
362 * @active: the total number of running pid at this moment.
363 * @exp: the fixed-point defined in the beginning.
364 */
365 static unsigned long calc_load(unsigned long load, unsigned long exp,
366 unsigned long active)
367 {
368 unsigned long newload;
369
370 active = active > 0 ? active * FIXED_1 : 0;
371 newload = load * exp + active * (FIXED_1 - exp);
372 if (active >= load)
373 newload += FIXED_1 - 1;
374
375 return newload / FIXED_1;
376 }
377
378 /*
379 * Return 0 means that container p->cg is closed.
380 * Return -1 means that error occurred in refresh.
381 * Positive num equals the total number of pid.
382 */
383 static int refresh_load(struct load_node *p, char *path)
384 {
385 __do_free char *line = NULL;
386 char **idbuf;
387 char proc_path[256];
388 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
389 size_t linelen = 0;
390 int sum, length;
391 struct dirent *file;
392
393 idbuf = malloc(sizeof(char *));
394 if (!idbuf)
395 return -1;
396
397 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
398 /* normal exit */
399 if (sum == 0)
400 goto out;
401
402 for (i = 0; i < sum; i++) {
403 __do_closedir DIR *dp = NULL;
404
405 /*clean up '\n' */
406 length = strlen(idbuf[i]) - 1;
407 idbuf[i][length] = '\0';
408 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
409 if (ret < 0 || ret > 255) {
410 i = sum;
411 sum = -1;
412 log_error(goto err_out, "snprintf() failed in refresh_load");
413 }
414
415 dp = opendir(proc_path);
416 if (!dp)
417 log_error(continue, "Open proc_path failed in refresh_load");
418
419 while ((file = readdir(dp)) != NULL) {
420 __do_free void *fopen_cache = NULL;
421 __do_fclose FILE *f = NULL;
422
423 if (strncmp(file->d_name, ".", 1) == 0)
424 continue;
425
426 if (strncmp(file->d_name, "..", 1) == 0)
427 continue;
428
429 total_pid++;
430
431 /* We make the biggest pid become last_pid.*/
432 ret = atof(file->d_name);
433 last_pid = (ret > last_pid) ? ret : last_pid;
434
435 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status",
436 idbuf[i], file->d_name);
437 if (ret < 0 || ret > 255) {
438 i = sum;
439 sum = -1;
440 log_error(goto err_out, "snprintf() failed in refresh_load");
441 }
442
443 f = fopen_cached(proc_path, "re", &fopen_cache);
444 if (f != NULL) {
445 while (getline(&line, &linelen, f) != -1) {
446 /* Find State */
447 if ((strncmp(line, "State", 5) == 0) &&
448 (strncmp(line, "State R", 7) == 0 ||
449 strncmp(line, "State D", 7) == 0))
450 run_pid++;
451 break;
452 }
453 }
454 }
455 }
456 /*Calculate the loadavg.*/
457 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
458 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
459 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
460 p->run_pid = run_pid;
461 p->total_pid = total_pid;
462 p->last_pid = last_pid;
463
464 err_out:
465 for (; i > 0; i--)
466 free(idbuf[i - 1]);
467 out:
468 free(idbuf);
469 return sum;
470 }
471
472 /* Delete the load_node n and return the next node of it. */
473 static struct load_node *del_node(struct load_node *n, int locate)
474 {
475 struct load_node *g;
476
477 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
478 if (n->next == NULL) {
479 *(n->pre) = NULL;
480 } else {
481 *(n->pre) = n->next;
482 n->next->pre = n->pre;
483 }
484 g = n->next;
485 free_disarm(n->cg);
486 free_disarm(n);
487 pthread_rwlock_unlock(&load_hash[locate].rdlock);
488 return g;
489 }
490
/*
 * Loadavg daemon main loop: roughly every FLUSH_TIME seconds walk the
 * whole hash table, refresh every tracked cgroup's load data via
 * refresh_load(), and delete nodes whose cgroup has disappeared
 * (refresh_load() returned 0).
 */
static void *load_begin(void *arg)
{

	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				__do_free char *path = NULL;

				length = strlen(f->cg) + 2;
				/* strlen(f->cg) + '.' or '' + \0 */
				path = malloc(length);
				if (!path)
					goto out;

				ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
				/* Ignore the node if snprintf fails.*/
				if (ret < 0 || ret > length - 1)
					log_error(goto out, "Refresh node %s failed for snprintf()", f->cg);

				sum = refresh_load(f, path);
				if (sum == 0)
					f = del_node(f, i);
				else
					/* "out" lives inside this else branch:
					 * a failed malloc/snprintf above skips
					 * the refresh and simply advances to
					 * the next node. */
					out: f = f->next;
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		time2 = clock();
		/* NOTE(review): clock() measures CPU time, not wall time, so
		 * this compensation is approximate; a negative difference
		 * would convert to a huge useconds_t — presumably refresh is
		 * always cheaper than FLUSH_TIME. TODO confirm. */
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
549
550 /*
551 * init_load initialize the hash table.
552 * Return 0 on success, return -1 on failure.
553 */
554 static int init_load(void)
555 {
556 int i;
557 int ret;
558
559 for (i = 0; i < LOAD_SIZE; i++) {
560 load_hash[i].next = NULL;
561 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
562 if (ret) {
563 lxcfs_error("Failed to initialize lock");
564 goto out3;
565 }
566
567 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
568 if (ret) {
569 lxcfs_error("Failed to initialize rdlock");
570 goto out2;
571 }
572
573 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
574 if (ret) {
575 lxcfs_error("Failed to initialize rilock");
576 goto out1;
577 }
578 }
579
580 return 0;
581
582 out1:
583 pthread_rwlock_destroy(&load_hash[i].rdlock);
584 out2:
585 pthread_mutex_destroy(&load_hash[i].lock);
586 out3:
587 while (i > 0) {
588 i--;
589 pthread_mutex_destroy(&load_hash[i].lock);
590 pthread_rwlock_destroy(&load_hash[i].rdlock);
591 pthread_rwlock_destroy(&load_hash[i].rilock);
592 }
593
594 return -1;
595 }
596
597 static void load_free(void)
598 {
599 struct load_node *f, *p;
600
601 for (int i = 0; i < LOAD_SIZE; i++) {
602 pthread_mutex_lock(&load_hash[i].lock);
603 pthread_rwlock_wrlock(&load_hash[i].rilock);
604 pthread_rwlock_wrlock(&load_hash[i].rdlock);
605 if (load_hash[i].next == NULL) {
606 pthread_mutex_unlock(&load_hash[i].lock);
607 pthread_mutex_destroy(&load_hash[i].lock);
608 pthread_rwlock_unlock(&load_hash[i].rilock);
609 pthread_rwlock_destroy(&load_hash[i].rilock);
610 pthread_rwlock_unlock(&load_hash[i].rdlock);
611 pthread_rwlock_destroy(&load_hash[i].rdlock);
612 continue;
613 }
614
615 for (f = load_hash[i].next; f;) {
616 free_disarm(f->cg);
617 p = f->next;
618 free_disarm(f);
619 f = p;
620 }
621
622 pthread_mutex_unlock(&load_hash[i].lock);
623 pthread_mutex_destroy(&load_hash[i].lock);
624 pthread_rwlock_unlock(&load_hash[i].rilock);
625 pthread_rwlock_destroy(&load_hash[i].rilock);
626 pthread_rwlock_unlock(&load_hash[i].rdlock);
627 pthread_rwlock_destroy(&load_hash[i].rdlock);
628 }
629 }
630
631 /* Return a positive number on success, return 0 on failure.*/
632 pthread_t load_daemon(int load_use)
633 {
634 int ret;
635 pthread_t pid;
636
637 ret = init_load();
638 if (ret == -1)
639 return log_error(0, "Initialize hash_table fails in load_daemon!");
640
641 ret = pthread_create(&pid, NULL, load_begin, NULL);
642 if (ret != 0) {
643 load_free();
644 return log_error(0, "Create pthread fails in load_daemon!");
645 }
646
647 /* use loadavg, here loadavg = 1*/
648 loadavg = load_use;
649 return pid;
650 }
651
652 /* Returns 0 on success. */
653 int stop_load_daemon(pthread_t pid)
654 {
655 int s;
656
657 /* Signal the thread to gracefully stop */
658 loadavg_stop = 1;
659
660 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
661 if (s)
662 return log_error(-1, "stop_load_daemon error: failed to join");
663
664 load_free();
665 loadavg_stop = 0;
666
667 return 0;
668 }