]> git.proxmox.com Git - mirror_lxcfs.git/blobdiff - bindings.c
Use hash table to store load information
[mirror_lxcfs.git] / bindings.c
index f3aefa15004338f0718c6c6bdbc1a6ef1ea40800..d4ddd3300e1d2e41588e051f483d856454dab82c 100644 (file)
@@ -66,6 +66,7 @@ enum {
        LXC_TYPE_PROC_STAT,
        LXC_TYPE_PROC_DISKSTATS,
        LXC_TYPE_PROC_SWAPS,
+       LXC_TYPE_PROC_LOADAVG,
 };
 
 struct file_info {
@@ -79,6 +80,158 @@ struct file_info {
        int cached;
 };
 
+/*
+ * Hash table helpers.
+ *
+ * calc_hash() maps the NUL-terminated string name onto a bucket index in the
+ * range [0, LOAD_SIZE) using the ELF hash algorithm.
+ * NOTE(review): *name is read as plain char, so on platforms where char is
+ * signed, bytes >= 0x80 are added as negative values - confirm that keys are
+ * ASCII only or that this is acceptable.
+ */
+#define LOAD_SIZE 100 /* Number of buckets in the hash table. */
+static int calc_hash(char *name)
+{
+       unsigned int hash = 0;
+       unsigned int x = 0;
+       /* ELFHash algorithm. */
+       while (*name) {
+               hash = (hash << 4) + *name++;
+               x = hash & 0xf0000000; /* x holds the top four bits of hash. */
+               if (x != 0)
+                       hash ^= (x >> 24); /* Fold those bits back into bits 4-7. */
+               hash &= ~x; /* Clear the top four bits again. */
+       }
+       return ((hash & 0x7fffffff) % LOAD_SIZE); /* Always in 0 .. LOAD_SIZE - 1. */
+}
+/*
+ * One entry in a bucket list of the load_hash table. Nodes are linked in at
+ * the front of a bucket by insert_node(), looked up by their cg string in
+ * locate_node(), and unlinked and freed by del_node().
+ */
+struct load_node {
+       char *cg;  /* Lookup key, compared with strcmp(); freed by del_node(). NOTE(review): presumably the cgroup path - confirm. */
+       unsigned long avenrun[3];               /* Load averages */
+       unsigned int run_pid; /* NOTE(review): presumably the number of running tasks - confirm. */
+       unsigned int total_pid; /* NOTE(review): presumably the total number of tasks - confirm. */
+       unsigned int last_pid; /* NOTE(review): presumably the most recently seen pid - confirm. */
+       int cfd; /* The file descriptor of the mounted cgroup */
+       struct  load_node *next; /* Next node in the same bucket, or NULL for the last one. */
+       struct  load_node **pre; /* Address of the pointer that points at this node: the bucket's next field or the previous node's next field. */
+};
+/*
+ * Head of one hash bucket: three locks plus the pointer to the first
+ * load_node of the bucket's list (NULL when the bucket is empty).
+ */
+struct load_head {
+       /*
+        * Serializes inserting a load_node with refreshing the load_nodes: for
+        * the first load_node of a bucket, insert and refresh are mutually
+        * exclusive. insert_node() holds this mutex while it changes the head.
+        */
+       pthread_mutex_t lock;
+       /*
+        * Protects readers of the load average against deletion of a load_node.
+        * Within one bucket, read and delete are mutually exclusive, while any
+        * number of readers may run concurrently. locate_node() takes it as a
+        * read lock and del_node() as a write lock. This lock covers the whole
+        * list of the bucket.
+        */
+       pthread_rwlock_t rdlock;
+       /*
+        * Protects readers of the load average against insertion of a
+        * load_node. For the first load_node of a bucket, read and insert are
+        * mutually exclusive, while any number of readers may run concurrently.
+        * locate_node() takes it as a read lock while it fetches the bucket
+        * head, insert_node() as a write lock while it replaces the head.
+        */
+       pthread_rwlock_t rilock;
+       struct load_node *next; /* First node of the bucket, or NULL. */
+};
+/*
+ * The hash table itself: LOAD_SIZE buckets, each with its own set of locks.
+ */
+static struct load_head load_hash[LOAD_SIZE]; /* hash table */
+/*
+ * init_load() initializes every bucket of the hash table: the list is set to
+ * empty and the mutex and both rwlocks are created with default attributes.
+ * If any initialization fails, every lock that was already initialized (in
+ * the failing bucket and in all earlier buckets) is destroyed again.
+ * Return 0 on success, return -1 on failure.
+ */
+static int init_load(void)
+{
+       int i;
+       int ret;
+
+       for (i = 0; i < LOAD_SIZE; i++) {
+               load_hash[i].next = NULL;
+               ret = pthread_mutex_init(&load_hash[i].lock, NULL);
+               if (ret != 0) {
+                       lxcfs_error("%s\n", "Failed to initialize lock");
+                       goto out3; /* Nothing of bucket i exists yet. */
+               }
+               ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
+               if (ret != 0) {
+                       lxcfs_error("%s\n", "Failed to initialize rdlock");
+                       goto out2; /* Only the mutex of bucket i exists. */
+               }
+               ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
+               if (ret != 0) {
+                       lxcfs_error("%s\n", "Failed to initialize rilock");
+                       goto out1; /* Mutex and rdlock of bucket i exist. */
+               }
+       }
+       return 0;
+out1: /* Undo the partially initialized bucket i, then fall through. */
+       pthread_rwlock_destroy(&load_hash[i].rdlock);
+out2:
+       pthread_mutex_destroy(&load_hash[i].lock);
+out3: /* Destroy the locks of the fully initialized buckets i-1 .. 0. */
+       while (i > 0) {
+               i--;
+               pthread_mutex_destroy(&load_hash[i].lock);
+               pthread_rwlock_destroy(&load_hash[i].rdlock);
+               pthread_rwlock_destroy(&load_hash[i].rilock);
+       }
+       return -1;
+}
+/*
+ * insert_node() links the node *n in as the new first node of the bucket
+ * load_hash[locate]. The bucket's mutex and the write side of its rilock are
+ * held while the head pointer and the pre/next links are updated; rdlock is
+ * not taken here. The function does not check n or *n for NULL.
+ */
+static void insert_node(struct load_node **n, int locate)
+{
+       struct load_node *f; /* Previous first node of the bucket, may be NULL. */
+
+       pthread_mutex_lock(&load_hash[locate].lock);
+       pthread_rwlock_wrlock(&load_hash[locate].rilock);
+       f = load_hash[locate].next;
+       load_hash[locate].next = *n;
+
+       (*n)->pre = &(load_hash[locate].next); /* The new head is pointed at by the bucket itself. */
+       if (f)
+               f->pre = &((*n)->next); /* The old head is now reached through (*n)->next. */
+       (*n)->next = f;
+       pthread_mutex_unlock(&load_hash[locate].lock); /* The mutex is released before rilock, i.e. not in reverse acquisition order. */
+       pthread_rwlock_unlock(&load_hash[locate].rilock);
+}
+/*
+ * locate_node() searches the bucket load_hash[locate] for the node whose cg
+ * string equals cg. It returns that node, or NULL if the bucket is empty or
+ * holds no such node.
+ *
+ * rilock is read-locked only while the bucket head is fetched. rdlock is
+ * read-locked and deliberately NOT unlocked before returning - on every
+ * path, including the NULL ones - because this function is used to read a
+ * node and deletion must not happen before the read has ended.
+ * The caller has to unlock rdlock; this is done only in proc_loadavg_read().
+ */
+static struct load_node *locate_node(char *cg, int locate)
+{
+       struct load_node *f = NULL;
+       int i = 0; /* Receives the strcmp() result below; not read otherwise. */
+
+       pthread_rwlock_rdlock(&load_hash[locate].rilock);
+       pthread_rwlock_rdlock(&load_hash[locate].rdlock);
+       if (load_hash[locate].next == NULL) {
+               pthread_rwlock_unlock(&load_hash[locate].rilock);
+               return f; /* Empty bucket: NULL is returned, rdlock stays held. */
+       }
+       f = load_hash[locate].next;
+       pthread_rwlock_unlock(&load_hash[locate].rilock); /* The head has been read, inserts may proceed. */
+       while (f && ((i = strcmp(f->cg, cg)) != 0))
+               f = f->next;
+       return f;
+}
+/*
+ * Delete the load_node n and return the next node of it (NULL if n was the
+ * last node of its list). n is unlinked from the bucket load_hash[locate],
+ * then n->cg and n itself are freed. The write side of the bucket's rdlock is
+ * held for the whole operation.
+ */
+static struct load_node *del_node(struct load_node *n, int locate)
+{
+       struct load_node *g; /* Successor of n, saved before n is freed. */
+
+       pthread_rwlock_wrlock(&load_hash[locate].rdlock);
+       if (n->next == NULL) {
+               *(n->pre) = NULL; /* n was last: whatever pointed at n now ends the list. */
+       } else {
+               *(n->pre) = n->next; /* Bypass n ... */
+               n->next->pre = n->pre; /* ... and let the successor remember its new predecessor link. */
+       }
+       g = n->next;
+       free(n->cg);
+       free(n);
+       pthread_rwlock_unlock(&load_hash[locate].rdlock);
+       return g;
+}
+
 /* Reserve buffer size to account for file size changes. */
 #define BUF_RESERVE_SIZE 512
 
@@ -139,6 +292,7 @@ static char **hierarchies;
  * another namespace using the *at() family of functions
  * {openat(), fchownat(), ...}. */
 static int *fd_hierarchies;
+static int cgroup_mount_ns_fd = -1;
 
 static void unlock_mutex(pthread_mutex_t *l)
 {
@@ -421,6 +575,7 @@ static void print_subsystems(void)
 {
        int i;
 
+       fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
        fprintf(stderr, "hierarchies:\n");
        for (i = 0; i < num_hierarchies; i++) {
                if (hierarchies[i])
@@ -2959,23 +3114,23 @@ static void parse_memstat(char *memstat, unsigned long *cached,
        char *eol;
 
        while (*memstat) {
-               if (startswith(memstat, "cache")) {
-                       sscanf(memstat + 5, "%lu", cached);
+               if (startswith(memstat, "total_cache")) {
+                       sscanf(memstat + 11, "%lu", cached);
                        *cached /= 1024;
-               } else if (startswith(memstat, "active_anon")) {
-                       sscanf(memstat + 11, "%lu", active_anon);
+               } else if (startswith(memstat, "total_active_anon")) {
+                       sscanf(memstat + 17, "%lu", active_anon);
                        *active_anon /= 1024;
-               } else if (startswith(memstat, "inactive_anon")) {
-                       sscanf(memstat + 13, "%lu", inactive_anon);
+               } else if (startswith(memstat, "total_inactive_anon")) {
+                       sscanf(memstat + 19, "%lu", inactive_anon);
                        *inactive_anon /= 1024;
-               } else if (startswith(memstat, "active_file")) {
-                       sscanf(memstat + 11, "%lu", active_file);
+               } else if (startswith(memstat, "total_active_file")) {
+                       sscanf(memstat + 17, "%lu", active_file);
                        *active_file /= 1024;
-               } else if (startswith(memstat, "inactive_file")) {
-                       sscanf(memstat + 13, "%lu", inactive_file);
+               } else if (startswith(memstat, "total_inactive_file")) {
+                       sscanf(memstat + 19, "%lu", inactive_file);
                        *inactive_file /= 1024;
-               } else if (startswith(memstat, "unevictable")) {
-                       sscanf(memstat + 11, "%lu", unevictable);
+               } else if (startswith(memstat, "total_unevictable")) {
+                       sscanf(memstat + 17, "%lu", unevictable);
                        *unevictable /= 1024;
                }
                eol = strchr(memstat, '\n');
@@ -3165,16 +3320,16 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
                        snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
                        printme = lbuf;
                } else if (startswith(line, "MemAvailable:")) {
-                       snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage);
+                       snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
                        printme = lbuf;
                } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
                        sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
-                       if (hostswtotal < memswlimit - memlimit)
-                               memswlimit = hostswtotal + memlimit;
-                       snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit - memlimit);
+                       if (hostswtotal < memswlimit)
+                               memswlimit = hostswtotal;
+                       snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
                        printme = lbuf;
                } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
-                       unsigned long swaptotal = memswlimit - memlimit,
+                       unsigned long swaptotal = memswlimit,
                                        swapusage = memswusage - memusage,
                                        swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
                        snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
@@ -3591,24 +3746,6 @@ static uint64_t get_reaper_age(pid_t pid)
        return procage;
 }
 
-static uint64_t get_reaper_btime(pid)
-{
-       int ret;
-       struct sysinfo sys;
-       uint64_t procstart;
-       uint64_t uptime;
-
-       ret = sysinfo(&sys);
-       if (ret < 0) {
-               lxcfs_debug("%s\n", "failed to retrieve system information");
-               return 0;
-       }
-
-       uptime = (uint64_t)time(NULL) - (uint64_t)sys.uptime;
-       procstart = get_reaper_start_time_in_sec(pid);
-       return uptime + procstart;
-}
-
 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
 static int proc_stat_read(char *buf, size_t size, off_t offset,
                struct fuse_file_info *fi)
@@ -3672,10 +3809,7 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
                        continue;
                if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
                        /* not a ^cpuN line containing a number N, just print it */
-                       if (strncmp(line, "btime", 5) == 0)
-                               l = snprintf(cache, cache_size, "btime %"PRIu64"\n", get_reaper_btime(fc->pid));
-                       else
-                               l = snprintf(cache, cache_size, "%s", line);
+                       l = snprintf(cache, cache_size, "%s", line);
                        if (l < 0) {
                                perror("Error writing to cache");
                                rv = 0;
@@ -3845,10 +3979,10 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset,
 #endif
 
        if (offset){
-               if (offset > d->size)
-                       return -EINVAL;
                if (!d->cached)
                        return 0;
+               if (offset > d->size)
+                       return -EINVAL;
                int left = d->size - offset;
                total_len = left > size ? size: left;
                memcpy(buf, cache + offset, total_len);
@@ -3863,8 +3997,8 @@ static int proc_uptime_read(char *buf, size_t size, off_t offset,
        if (reaperage >= busytime)
                idletime = reaperage - busytime;
 
-       total_len = snprintf(d->buf, d->size, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
-       if (total_len < 0 || total_len >=  d->size){
+       total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
+       if (total_len < 0 || total_len >=  d->buflen){
                lxcfs_error("%s\n", "failed to write to cache");
                return 0;
        }
@@ -4147,7 +4281,8 @@ int proc_getattr(const char *path, struct stat *sb)
                        strcmp(path, "/proc/uptime") == 0 ||
                        strcmp(path, "/proc/stat") == 0 ||
                        strcmp(path, "/proc/diskstats") == 0 ||
-                       strcmp(path, "/proc/swaps") == 0) {
+                       strcmp(path, "/proc/swaps") == 0 ||
+                       strcmp(path, "/proc/loadavg") == 0) {
                sb->st_size = 0;
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
@@ -4167,7 +4302,8 @@ int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offs
            filler(buf, "stat", NULL, 0) != 0 ||
            filler(buf, "uptime", NULL, 0) != 0 ||
            filler(buf, "diskstats", NULL, 0) != 0 ||
-           filler(buf, "swaps", NULL, 0) != 0)
+           filler(buf, "swaps", NULL, 0) != 0   ||
+           filler(buf, "loadavg", NULL, 0) != 0)
                return -EINVAL;
        return 0;
 }
@@ -4189,6 +4325,8 @@ int proc_open(const char *path, struct fuse_file_info *fi)
                type = LXC_TYPE_PROC_DISKSTATS;
        else if (strcmp(path, "/proc/swaps") == 0)
                type = LXC_TYPE_PROC_SWAPS;
+       else if (strcmp(path, "/proc/loadavg") == 0)
+               type = LXC_TYPE_PROC_LOADAVG;
        if (type == -1)
                return -ENOENT;
 
@@ -4246,6 +4384,8 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset,
                return proc_diskstats_read(buf, size, offset, fi);
        case LXC_TYPE_PROC_SWAPS:
                return proc_swaps_read(buf, size, offset, fi);
+       case LXC_TYPE_PROC_LOADAVG:
+               return proc_loadavg_read(buf, size, offset, fi);
        default:
                return -EINVAL;
        }
@@ -4480,6 +4620,19 @@ static bool permute_root(void)
        return true;
 }
 
+static int preserve_mnt_ns(int pid) /* Open /proc/<pid>/ns/mnt and return the resulting file descriptor. */
+{
+       int ret; /* snprintf() result, used only for the error/truncation check. */
+       size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt"); /* Both literals plus 21 bytes for the formatted pid; each sizeof() also counts the literal's NUL. */
+       char path[len];
+
+       ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
+       if (ret < 0 || (size_t)ret >= len)
+               return -1; /* Formatting failed or the path would have been truncated. */
+
+       return open(path, O_RDONLY | O_CLOEXEC); /* Result of open() is passed through unchanged, so -1 on failure. */
+}
+
 static bool cgfs_prepare_mounts(void)
 {
        if (!mkdir_p(BASEDIR, 0700)) {
@@ -4497,6 +4650,12 @@ static bool cgfs_prepare_mounts(void)
                return false;
        }
 
+       cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
+       if (cgroup_mount_ns_fd < 0) {
+               lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
+               return false;
+       }
+
        if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
                lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
                return false;
@@ -4570,19 +4729,6 @@ static bool cgfs_setup_controllers(void)
        return true;
 }
 
-static int preserve_ns(int pid)
-{
-       int ret;
-       size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */;
-       char path[len];
-
-       ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
-       if (ret < 0 || (size_t)ret >= len)
-               return -1;
-
-       return open(path, O_RDONLY | O_CLOEXEC);
-}
-
 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
 {
        FILE *f;
@@ -4626,7 +4772,7 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void)
        }
 
        /* Preserve initial namespace. */
-       init_ns = preserve_ns(getpid());
+       init_ns = preserve_mnt_ns(getpid());
        if (init_ns < 0) {
                lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
                goto out;
@@ -4683,4 +4829,7 @@ static void __attribute__((destructor)) free_subsystems(void)
        }
        free(hierarchies);
        free(fd_hierarchies);
+
+       if (cgroup_mount_ns_fd >= 0)
+               close(cgroup_mount_ns_fd);
 }