/* SPDX-License-Identifier: LGPL-2.1+ */
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#ifndef FUSE_USE_VERSION
-#define FUSE_USE_VERSION 26
-#endif
-
-#define _FILE_OFFSET_BITS 64
+#include "config.h"
-#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
-#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>
+#include "proc_cpuview.h"
+
#include "bindings.h"
-#include "config.h"
#include "cgroup_fuse.h"
#include "cpuset_parse.h"
#include "cgroups/cgroup.h"
}
out_rwlock_unlock:
+ pthread_mutex_lock(&rv->lock);
pthread_rwlock_unlock(&head->lock);
return move_ptr(rv);
}
return faccessat(cfd, path, F_OK, 0) == 0;
}
+/* should be called with wr-locked list */
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
struct cg_proc_stat *first = NULL;
for (struct cg_proc_stat *prev = NULL; node; ) {
if (!cgroup_supports("cpu", node->cg, "cpu.shares")) {
- call_cleaner(free_proc_stat_node) struct cg_proc_stat *cur = node;
+ struct cg_proc_stat *cur = node;
+
+ /*
+ * We need to ensure that no one referenced this node,
+ * because we are going to remove it from the list and free memory.
+ *
+ * If we can't grab the lock then just keep this node for now.
+ */
+ if (pthread_mutex_trylock(&cur->lock))
+ goto next;
+
+ /*
+ * Yes, we can put lock back just after taking it, as we ensured
+ * that we are only one user of it right now.
+ *
+ * It follows from three facts:
+ * - we are under pthread_rwlock_wrlock(hash_table_bucket)
+ * - pthread_mutex_lock is taken by find_proc_stat_node()
+ * with pthread_rwlock_rdlock(hash_table_bucket) held.
+ * - pthread_mutex_lock is taken by add_proc_stat_node()
+ * with pthread_rwlock_wrlock(hash_table_bucket) held.
+ *
+ * It means that nobody can get a pointer to (cur) node in a parallel
+ * thread and all old users of (cur) node have released pthread_mutex_lock(cur).
+ */
+ pthread_mutex_unlock(&cur->lock);
if (prev)
prev->next = node->next;
first = node->next;
node = node->next;
- lxcfs_debug("Removing stat node for %s\n", cur->cg);
+	lxcfs_debug("Removing stat node for %s\n", cur->cg);
+
+ free_proc_stat_node(cur);
} else {
+next:
if (!first)
first = node;
prev = node;
{
struct cg_proc_stat *node;
+ prune_proc_stat_history();
pthread_rwlock_rdlock(&head->lock);
if (!head->next) {
node = head->next;
do {
- if (strcmp(cg, node->cg) == 0)
+ if (strcmp(cg, node->cg) == 0) {
+ pthread_mutex_lock(&node->lock);
goto out;
+ }
} while ((node = node->next));
node = NULL;
out:
pthread_rwlock_unlock(&head->lock);
- prune_proc_stat_history();
return node;
}
-static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
+static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage,
+ int cpu_count, const char *cg)
{
int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
struct cg_proc_stat_head *head = proc_stat_history[hash];
lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
}
- pthread_mutex_lock(&node->lock);
-
/*
* If additional CPUs on the host have been enabled, CPU usage counter
* arrays have to be expanded.
/*
* Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
* `cpu.cfs_period_us`, depending on `param`. Parameter value is returned
- * throuh `value`.
+ * through `value`.
*/
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
int nprocs;
int64_t cfs_quota, cfs_period;
- read_cpu_cfs_param(cg, "quota", &cfs_quota);
- read_cpu_cfs_param(cg, "period", &cfs_period);
+ if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
+ return 0;
+
+ if (!read_cpu_cfs_param(cg, "period", &cfs_period))
+ return 0;
if (cfs_quota <= 0 || cfs_period <= 0)
return 0;
return rv;
}
+/*
+ * Return true if cfs quota of the cgroup is neg / not set
+ */
+static bool cfs_quota_disabled(const char *cg)
+{
+ int64_t cfs_quota;
+
+ if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
+ return true;
+
+ return cfs_quota < 0;
+}
+
/*
* Return the maximum number of visible CPUs based on CPU quotas.
- * If there is no quota set, zero is returned.
+ * If there is no quota set, cpu number in cpuset value is returned.
*/
int max_cpu_count(const char *cg)
{
int64_t cfs_quota, cfs_period;
int nr_cpus_in_cpuset = 0;
- read_cpu_cfs_param(cg, "quota", &cfs_quota);
- read_cpu_cfs_param(cg, "period", &cfs_period);
+ if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
+ cfs_quota = 0;
+
+ if (!read_cpu_cfs_param(cg, "period", &cfs_period))
+ cfs_period = 0;
cpuset = get_cpuset(cg);
if (cpuset)
nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
- if (cfs_quota <= 0 || cfs_period <= 0){
+ if (cfs_quota <= 0 || cfs_period <= 0) {
if (nr_cpus_in_cpuset > 0)
return nr_cpus_in_cpuset;
rv = cfs_quota / cfs_period;
- /* In case quota/period does not yield a whole number, add one CPU for
+ /*
+ * In case quota/period does not yield a whole number, add one CPU for
* the remainder.
*/
if ((cfs_quota % cfs_period) > 0)
if (rv > nprocs)
rv = nprocs;
- /* use min value in cpu quota and cpuset */
+ /* Use min value in cpu quota and cpuset. */
if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
rv = nr_cpus_in_cpuset;
if (all_used >= cg_used) {
cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
-
} else {
- lxcfs_error("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
- curcpu, cg, all_used, cg_used);
+ lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time",
+ curcpu, cg, all_used, cg_used);
cg_cpu_usage[curcpu].idle = idle;
}
}
if (max_cpus > cpu_cnt || !max_cpus)
max_cpus = cpu_cnt;
+ /* takes lock pthread_mutex_lock(&node->lock) */
stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
if (!stat_node)
return log_error(0, "Failed to find/create stat node for %s", cg);
- diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
+ diff = zalloc(sizeof(struct cpuacct_usage) * nprocs);
if (!diff)
- return 0;
+ goto out_pthread_mutex_unlock;
/*
* If the new values are LOWER than values stored in memory, it means
i++;
- stat_node->usage[curcpu].user += diff[curcpu].user;
+ stat_node->usage[curcpu].user += diff[curcpu].user;
stat_node->usage[curcpu].system += diff[curcpu].system;
- stat_node->usage[curcpu].idle += diff[curcpu].idle;
+ stat_node->usage[curcpu].idle += diff[curcpu].idle;
if (max_cpus > 0 && i >= max_cpus) {
- user_surplus += diff[curcpu].user;
- system_surplus += diff[curcpu].system;
+ user_surplus += diff[curcpu].user;
+ system_surplus += diff[curcpu].system;
}
}
uint64_t max_diff_idle = 0;
uint64_t max_diff_idle_index = 0;
double exact_cpus;
-
/* threshold = maximum usage per cpu, including idle */
threshold = total_sum / cpu_cnt * max_cpus;
}
if (user_surplus > 0)
- lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
+		lxcfs_debug("leftover user: %" PRIu64 " for %s\n", user_surplus, cg);
if (system_surplus > 0)
- lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
+		lxcfs_debug("leftover system: %" PRIu64 " for %s\n", system_surplus, cg);
for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
if (!stat_node->usage[curcpu].online)
if (i == max_cpus)
break;
- stat_node->view[curcpu].user += diff[curcpu].user;
- stat_node->view[curcpu].system += diff[curcpu].system;
- stat_node->view[curcpu].idle += diff[curcpu].idle;
-
- user_sum += stat_node->view[curcpu].user;
- system_sum += stat_node->view[curcpu].system;
- idle_sum += stat_node->view[curcpu].idle;
+ stat_node->view[curcpu].user += diff[curcpu].user;
+ stat_node->view[curcpu].system += diff[curcpu].system;
+ stat_node->view[curcpu].idle += diff[curcpu].idle;
- diff_user += diff[curcpu].user;
- diff_system += diff[curcpu].system;
- diff_idle += diff[curcpu].idle;
+ diff_user += diff[curcpu].user;
+ diff_system += diff[curcpu].system;
+ diff_idle += diff[curcpu].idle;
if (diff[curcpu].idle > max_diff_idle) {
- max_diff_idle = diff[curcpu].idle;
- max_diff_idle_index = curcpu;
+ max_diff_idle = diff[curcpu].idle;
+ max_diff_idle_index = curcpu;
}
- lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
+ lxcfs_v("curcpu: %d, diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
+ }
+ lxcfs_v("total. diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", diff_user, diff_system, diff_idle);
+
+ for (curcpu = 0; curcpu < nprocs; curcpu++) {
+ user_sum += stat_node->view[curcpu].user;
+ system_sum += stat_node->view[curcpu].system;
+ idle_sum += stat_node->view[curcpu].idle;
}
- lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
/* revise cpu usage view to support partial cpu case. */
exact_cpus = exact_cpu_count(cg);
- if (exact_cpus < (double)max_cpus){
+
+ /* skip revise cpu when cfs quota is disabled (exact_cpus == 0) */
+	if (!cfs_quota_disabled(cg) && exact_cpus < (double)max_cpus) {
uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
- lxcfs_v("delta: %lu\n", delta);
- lxcfs_v("idle_sum before: %lu\n", idle_sum);
- idle_sum = idle_sum > delta ? idle_sum - delta : 0;
- lxcfs_v("idle_sum after: %lu\n", idle_sum);
+ lxcfs_v("delta: %" PRIu64 "\n", delta);
+ lxcfs_v("idle_sum before: %" PRIu64 "\n", idle_sum);
+ if (idle_sum > delta)
+ idle_sum = idle_sum - delta;
+ else
+ idle_sum = 0;
+		lxcfs_v("idle_sum after: %" PRIu64 "\n", idle_sum);
curcpu = max_diff_idle_index;
- lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
- stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
- lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
+ lxcfs_v("curcpu: %d, idle before: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
+ if (stat_node->view[curcpu].idle > delta)
+ stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta;
+ else
+ stat_node->view[curcpu].idle = 0;
+ lxcfs_v("curcpu: %d, idle after: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle);
}
} else {
for (curcpu = 0; curcpu < nprocs; curcpu++) {
if (!stat_node->usage[curcpu].online)
continue;
- stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
- stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
- stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
+ stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
+ stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
+ stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
- user_sum += stat_node->view[curcpu].user;
- system_sum += stat_node->view[curcpu].system;
- idle_sum += stat_node->view[curcpu].idle;
+ user_sum += stat_node->view[curcpu].user;
+ system_sum += stat_node->view[curcpu].system;
+ idle_sum += stat_node->view[curcpu].idle;
}
}
"cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
user_sum, system_sum, idle_sum);
lxcfs_v("cpu-all: %s\n", buf);
- if (l < 0)
- return log_error(0, "Failed to write cache");
- if (l >= buf_size)
- return log_error(0, "Write to cache was truncated");
+ if (l < 0) {
+ lxcfs_error("Failed to write cache");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
+ if ((size_t)l >= buf_size) {
+ lxcfs_error("Write to cache was truncated");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
buf += l;
buf_size -= l;
total_len += l;
- /* Render visible CPUs */
+ /* Render visible CPUs
+ Assume there are K CPUs: 0, 1, 2, ..., K-1.
+ Among them, there are M online CPUs with index: a1, a2, ... aN ... aM (M >= N)
+ N = max_cpus, M = number of online CPUs
+
+ There will be N rendered cpus, indexed from 0 to N-1, cpu times of the cpus are calculated from those formula:
+ - user_time[0] = stat_node->view[0].user + stat_node->view[1].user + ... + stat_node->view[a1].user
+ - user_time[1] = stat_node->view[a1+1].user + stat_node->view[a1+1].user + ... + stat_node->view[a2].user
+ ...
+ - user_time[N-2] = stat_node->view[a(N-2)+1].user + stat_node->view[a(N-2)+2].user + ...
+ + stat_node->view[a(N-1)].user
+ - user_time[N-1] = stat_node->view[a(N-1)+1].user + stat_node->view[a(N-1)+2].user + ...
+ + stat_node->view[aN] + ... + stat_node->view[K-1] (sum of all remaining CPUs)
+
+ Similar formula applied for system and idle time
+ */
+
+ uint64_t curcpu_view_user_sum = 0, curcpu_view_system_sum = 0, curcpu_view_idle_sum = 0;
for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
- if (!stat_node->usage[curcpu].online)
- continue;
+ curcpu_view_user_sum += stat_node->view[curcpu].user;
+ curcpu_view_system_sum += stat_node->view[curcpu].system;
+ curcpu_view_idle_sum += stat_node->view[curcpu].idle;
+ if (!stat_node->usage[curcpu].online && curcpu < nprocs - 1) {
+ continue;
+ }
+
i++;
- if (max_cpus > 0 && i == max_cpus)
- break;
+ if (max_cpus > 0 && i >= max_cpus) {
+ // max(i) = count(rendered cpus) = max_cpus - 1
+ i--;
+ }
+
+ if (max_cpus > 0 && i == max_cpus - 1 && curcpu < nprocs - 1) {
+ // last 'rendered' cpu, sum until reaches the last cpu
+ continue;
+ }
l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n",
i,
- stat_node->view[curcpu].user,
- stat_node->view[curcpu].system,
- stat_node->view[curcpu].idle);
+ curcpu_view_user_sum,
+ curcpu_view_system_sum,
+ curcpu_view_idle_sum);
lxcfs_v("cpu: %s\n", buf);
- if (l < 0)
- return log_error(0, "Failed to write cache");
- if (l >= buf_size)
- return log_error(0, "Write to cache was truncated");
+ if (l < 0) {
+ lxcfs_error("Failed to write cache");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
+ if ((size_t)l >= buf_size) {
+ lxcfs_error("Write to cache was truncated");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
buf += l;
buf_size -= l;
total_len += l;
+
+ curcpu_view_user_sum = 0;
+ curcpu_view_system_sum = 0;
+ curcpu_view_idle_sum = 0;
}
/* Pass the rest of /proc/stat, start with the last line read */
l = snprintf(buf, buf_size, "%s", line);
- if (l < 0)
- return log_error(0, "Failed to write cache");
- if (l >= buf_size)
- return log_error(0, "Write to cache was truncated");
+ if (l < 0) {
+ lxcfs_error("Failed to write cache");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
+ if ((size_t)l >= buf_size) {
+ lxcfs_error("Write to cache was truncated");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
buf += l;
buf_size -= l;
/* Pass the rest of the host's /proc/stat */
while (getline(&line, &linelen, f) != -1) {
l = snprintf(buf, buf_size, "%s", line);
- if (l < 0)
- return log_error(0, "Failed to write cache");
- if (l >= buf_size)
- return log_error(0, "Write to cache was truncated");
+ if (l < 0) {
+ lxcfs_error("Failed to write cache");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
+ if ((size_t)l >= buf_size) {
+ lxcfs_error("Write to cache was truncated");
+ total_len = 0;
+ goto out_pthread_mutex_unlock;
+ }
buf += l;
buf_size -= l;
total_len += l;
}
+out_pthread_mutex_unlock:
if (stat_node)
pthread_mutex_unlock(&stat_node->lock);
size_t cache_size = d->buflen;
if (offset) {
- int left;
+ size_t left;
if (offset > d->size)
return -EINVAL;
l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
if (l < 0)
return log_error(0, "Failed to write cache");
- if (l >= cache_size)
+ if ((size_t)l >= cache_size)
return log_error(0, "Write to cache was truncated");
cache += l;
cache_size -= l;
l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
if (l < 0)
return log_error(0, "Failed to write cache");
- if (l >= cache_size)
+ if ((size_t)l >= cache_size)
return log_error(0, "Write to cache was truncated");
cache += l;
l = snprintf(cache, cache_size, "%s", line);
if (l < 0)
return log_error(0, "Failed to write cache");
- if (l >= cache_size)
+ if ((size_t)l >= cache_size)
return log_error(0, "Write to cache was truncated");
cache += l;
cache_size = d->buflen;
total_len = 0;
l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
- if (l < 0 || l >= cache_size)
+ if (l < 0 || (size_t)l >= cache_size)
return 0;
cache_size -= l;
cache += l;
total_len += l;
l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
- if (l < 0 || l >= cache_size)
+ if (l < 0 || (size_t)l >= cache_size)
return 0;
cache_size -= l;
cache += l;
total_len += l;
l = snprintf(cache, cache_size, "%s", origcache);
- if (l < 0 || l >= cache_size)
+ if (l < 0 || (size_t)l >= cache_size)
return 0;
total_len += l;
}
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
- *head = malloc(sizeof(struct cg_proc_stat_head));
- if (!(*head))
- return log_error(false, "%s", strerror(errno));
+ __do_free struct cg_proc_stat_head *h;
- (*head)->lastcheck = time(NULL);
- (*head)->next = NULL;
+ h = zalloc(sizeof(struct cg_proc_stat_head));
+ if (!h)
+ return false;
- if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
- free_disarm(*head);
- return log_error(false, "Failed to initialize list lock");
- }
+ if (pthread_rwlock_init(&h->lock, NULL))
+ return false;
+
+ h->lastcheck = time(NULL);
+ *head = move_ptr(h);
return true;
}