int cached;
};
-/* reserve buffer size, for cpuall in /proc/stat */
-#define BUF_RESERVE_SIZE 256
+/* Reserve buffer size to account for file size changes. */
+#define BUF_RESERVE_SIZE 512
/*
* A table caching which pid is init for a pid namespace.
fnam = alloca(len);
ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
if (ret < 0 || (size_t)ret >= len)
- return NULL;
+ return false;
fd = openat(cfd, fnam, O_RDONLY);
if (fd < 0)
- return NULL;
+ return false;
*value = slurp_file(fnam, fd);
return *value != NULL;
ret = -ENOENT;
goto out;
}
- if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
- ret = -EACCES;
- goto out;
- }
-
ret = 0;
}
if (initpid <= 0)
initpid = fc->pid;
if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
- if (!last || strcmp(next, last) == 0)
+ if (!last || (next && (strcmp(next, last) == 0)))
ret = -EBUSY;
else
ret = -ENOENT;
while (*memstat) {
if (startswith(memstat, "cache")) {
- sscanf(memstat + 11, "%lu", cached);
+ sscanf(memstat + 5, "%lu", cached);
*cached /= 1024;
} else if (startswith(memstat, "active_anon")) {
sscanf(memstat + 11, "%lu", active_anon);
*active_anon /= 1024;
} else if (startswith(memstat, "inactive_anon")) {
- sscanf(memstat + 11, "%lu", inactive_anon);
+ sscanf(memstat + 13, "%lu", inactive_anon);
*inactive_anon /= 1024;
} else if (startswith(memstat, "active_file")) {
sscanf(memstat + 11, "%lu", active_file);
*active_file /= 1024;
} else if (startswith(memstat, "inactive_file")) {
- sscanf(memstat + 11, "%lu", inactive_file);
+ sscanf(memstat + 13, "%lu", inactive_file);
*inactive_file /= 1024;
} else if (startswith(memstat, "unevictable")) {
sscanf(memstat + 11, "%lu", unevictable);
* FUSE ops for /proc
*/
-static unsigned long get_memlimit(const char *cgroup)
+static unsigned long get_memlimit(const char *cgroup, const char *file)
{
char *memlimit_str = NULL;
unsigned long memlimit = -1;
- if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &memlimit_str))
+ if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
memlimit = strtoul(memlimit_str, NULL, 10);
free(memlimit_str);
return memlimit;
}
-static unsigned long get_min_memlimit(const char *cgroup)
+static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
char *copy = strdupa(cgroup);
unsigned long memlimit = 0, retlimit;
- retlimit = get_memlimit(copy);
+ retlimit = get_memlimit(copy, file);
while (strcmp(copy, "/") != 0) {
copy = dirname(copy);
- memlimit = get_memlimit(copy);
+ memlimit = get_memlimit(copy, file);
if (memlimit != -1 && memlimit < retlimit)
retlimit = memlimit;
};
struct file_info *d = (struct file_info *)fi->fh;
char *cg;
char *memusage_str = NULL, *memstat_str = NULL,
- *memswlimit_str = NULL, *memswusage_str = NULL,
- *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
+ *memswlimit_str = NULL, *memswusage_str = NULL;
unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
- active_file = 0, inactive_file = 0, unevictable = 0;
+ active_file = 0, inactive_file = 0, unevictable = 0,
+ hostswtotal = 0;
char *line = NULL;
size_t linelen = 0, total_len = 0, rv = 0;
char *cache = d->buf;
return read_file("/proc/meminfo", buf, size, d);
prune_init_slice(cg);
- memlimit = get_min_memlimit(cg);
+ memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
goto err;
if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
{
- /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
- if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
- goto err;
- if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
- goto err;
-
- memswlimit = strtoul(memswlimit_str, NULL, 10);
+ memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
memswusage = strtoul(memswusage_str, NULL, 10);
- if (!strcmp(memswlimit_str, memswlimit_default_str))
- memswlimit = 0;
- if (!strcmp(memswusage_str, memswusage_default_str))
- memswusage = 0;
-
memswlimit = memswlimit / 1024;
memswusage = memswusage / 1024;
}
memset(lbuf, 0, 100);
if (startswith(line, "MemTotal:")) {
- sscanf(line+14, "%lu", &hosttotal);
+ sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
if (hosttotal < memlimit)
memlimit = hosttotal;
snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
printme = lbuf;
} else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
+ sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
+ if (hostswtotal < memswlimit - memlimit)
+ memswlimit = hostswtotal + memlimit;
snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
printme = lbuf;
} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
} else if (startswith(line, "SwapCached:")) {
snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
printme = lbuf;
- } else if (startswith(line, "Active")) {
+ } else if (startswith(line, "Active:")) {
snprintf(lbuf, 100, "Active: %8lu kB\n",
active_anon + active_file);
printme = lbuf;
- } else if (startswith(line, "Inactive")) {
+ } else if (startswith(line, "Inactive:")) {
snprintf(lbuf, 100, "Inactive: %8lu kB\n",
inactive_anon + inactive_file);
printme = lbuf;
free(memswlimit_str);
free(memswusage_str);
free(memstat_str);
- free(memswlimit_default_str);
- free(memswusage_default_str);
return rv;
}
return rv;
}
+/*
+ * Return the st_ctime of /proc/<qpid>, where <qpid> is the pid that
+ * lookup_initpid_in_store() yields for @pid.
+ *
+ * Returns 0 when no such pid is found, when the path does not fit into the
+ * local buffer, or when lstat() on the path fails.
+ */
+static long int getreaperctime(pid_t pid)
+{
+	char procdir[100];
+	struct stat st;
+	int n;
+	pid_t reaper = lookup_initpid_in_store(pid);
+
+	if (reaper <= 0)
+		return 0;
+
+	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
+	if (n < 0 || (size_t)n >= sizeof(procdir))
+		return 0;
+
+	return lstat(procdir, &st) < 0 ? 0 : st.st_ctime;
+}
+
+#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
static int proc_stat_read(char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
char *line = NULL;
size_t linelen = 0, total_len = 0, rv = 0;
int curcpu = -1; /* cpu numbering starts at 0 */
- unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
+ unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
- irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
-#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
+ irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
char cpuall[CPUALL_MAX_SIZE];
/* reserve for cpu all */
char *cache = d->buf + CPUALL_MAX_SIZE;
continue;
if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
/* not a ^cpuN line containing a number N, just print it */
- l = snprintf(cache, cache_size, "%s", line);
+ if (strncmp(line, "btime", 5) == 0)
+ l = snprintf(cache, cache_size, "btime %ld\n", getreaperctime(fc->pid));
+ else
+ l = snprintf(cache, cache_size, "%s", line);
if (l < 0) {
perror("Error writing to cache");
rv = 0;
cache_size -= l;
total_len += l;
- if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
- &softirq, &steal, &guest) != 9)
+ if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+ &user,
+ &nice,
+ &system,
+ &idle,
+ &iowait,
+ &irq,
+ &softirq,
+ &steal,
+ &guest,
+ &guest_nice) != 10)
continue;
user_sum += user;
nice_sum += nice;
softirq_sum += softirq;
steal_sum += steal;
guest_sum += guest;
+ guest_nice_sum += guest_nice;
}
cache = d->buf;
- int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
- if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
+ int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ user_sum,
+ nice_sum,
+ system_sum,
+ idle_sum,
+ iowait_sum,
+ irq_sum,
+ softirq_sum,
+ steal_sum,
+ guest_sum,
+ guest_nice_sum);
+ if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
memcpy(cache, cpuall, cpuall_len);
cache += cpuall_len;
- } else{
+ } else {
/* shouldn't happen */
lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
cpuall_len = 0;
total_len += cpuall_len;
d->cached = 1;
d->size = total_len;
- if (total_len > size ) total_len = size;
+ if (total_len > size)
+ total_len = size;
memcpy(buf, d->buf, total_len);
rv = total_len;
+/*
+ * Seconds elapsed between the value reported by getreaperctime() for @pid
+ * and the current time. Returns 0 when getreaperctime() returns 0.
+ */
static long int getreaperage(pid_t pid)
{
- char fnam[100];
- struct stat sb;
- int ret;
- pid_t qpid;
-
- qpid = lookup_initpid_in_store(pid);
- if (qpid <= 0)
- return 0;
-
- ret = snprintf(fnam, 100, "/proc/%d", qpid);
- if (ret < 0 || ret >= 100)
- return 0;
-
- if (lstat(fnam, &sb) < 0)
- return 0;
+	long int started = getreaperctime(pid);
- return time(NULL) - sb.st_ctime;
+
+	/* Nothing to measure against when no creation time is known. */
+	if (started == 0)
+		return 0;
+
+	return time(NULL) - started;
}
static unsigned long get_reaper_busy(pid_t task)
struct fuse_context *fc = fuse_get_context();
struct file_info *d = (struct file_info *)fi->fh;
char *cg = NULL;
- char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
- *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
+ char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
ssize_t total_len = 0, rv = 0;
ssize_t l = 0;
return read_file("/proc/swaps", buf, size, d);
prune_init_slice(cg);
- if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
- goto err;
+ memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
goto err;
- memlimit = strtoul(memlimit_str, NULL, 10);
memusage = strtoul(memusage_str, NULL, 10);
if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
- /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
- if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
- goto err;
- if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
- goto err;
-
- memswlimit = strtoul(memswlimit_str, NULL, 10);
+ memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
memswusage = strtoul(memswusage_str, NULL, 10);
- if (!strcmp(memswlimit_str, memswlimit_default_str))
- memswlimit = 0;
- if (!strcmp(memswusage_str, memswusage_default_str))
- memswusage = 0;
-
swap_total = (memswlimit - memlimit) / 1024;
swap_free = (memswusage - memusage) / 1024;
}
free(memlimit_str);
free(memusage_str);
free(memswusage_str);
- free(memswusage_default_str);
- free(memswlimit_default_str);
return rv;
}
return true;
}
-bool has_fs_type(const struct statfs *fs, __fsword_t magic_val)
+/*
+ * The type of statfs.f_type is derived with __typeof__ rather than spelled
+ * out; __typeof__ should be safe to use with all compilers.
+ */
+typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
+
+/* True when the f_type recorded in @fs equals @magic_val. */
+static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
+{
+	return fs->f_type == magic_val;
+}
+
+/*
+ * Walk /proc/self/mountinfo and return true if an entry whose mount point is
+ * "/" carries the "- rootfs rootfs " marker; return false otherwise, or when
+ * the file cannot be opened.
+ *
+ * Looking at fs/proc_namespace.c, it appears we can expect the rootfs entry
+ * to very specifically contain
+ *	" - rootfs rootfs "
+ * IIUC, so long as we've chrooted so that rootfs is not our root, the rootfs
+ * entry should always be skipped in mountinfo contents.
+ */
+static bool is_on_ramfs(void)
{
- return (fs->f_type == (__fsword_t)magic_val);
+	bool on_ramfs = false;
+	char *buf = NULL;
+	size_t cap = 0;
+	FILE *mounts = fopen("/proc/self/mountinfo", "r");
+
+	if (!mounts)
+		return false;
+
+	while (!on_ramfs && getline(&buf, &cap, mounts) != -1) {
+		char *mnt = buf, *mnt_end, *sep;
+		int skipped;
+
+		/* Step over the first four space-separated fields. */
+		for (skipped = 0; mnt && skipped < 4; skipped++)
+			mnt = strchr(mnt + 1, ' ');
+		if (!mnt)
+			continue;
+
+		/* Terminate the fifth field so it can be compared. */
+		mnt_end = strchr(mnt + 1, ' ');
+		if (!mnt_end)
+			continue;
+		*mnt_end = '\0';
+
+		/* Only an entry mounted at "/" is of interest. */
+		if (strcmp(mnt + 1, "/") != 0)
+			continue;
+
+		/* This is '/'. Is it the ramfs? */
+		sep = strchr(mnt_end + 1, '-');
+		on_ramfs = sep && strncmp(sep, "- rootfs rootfs ", 16) == 0;
+	}
+
+	free(buf);
+	fclose(mounts);
+	return on_ramfs;
}
-static int pivot_enter(void)
+static int pivot_enter()
{
int ret = -1, oldroot = -1, newroot = -1;
lxcfs_error("%s\n", "Failed to enter old root.");
goto err;
}
+
if (umount2(".", MNT_DETACH) < 0) {
lxcfs_error("%s\n", "Failed to detach old root.");
goto err;
close(oldroot);
if (newroot > 0)
close(newroot);
+
return ret;
}
+/*
+ * Enter the new root with chroot(): recursively bind-mount ROOTDIR onto "/",
+ * chroot() into the current directory, then chdir() to the new "/".
+ *
+ * Returns 0 on success and -1 on failure; the failing call is logged together
+ * with its errno text.
+ *
+ * NOTE(review): chroot(".") depends on the current working directory at the
+ * time of the call -- confirm the caller leaves it where the new root is.
+ */
+static int chroot_enter(void)
+{
+	if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
+		lxcfs_error("Failed to recursively bind-mount %s into /: %s.\n",
+			    ROOTDIR, strerror(errno));
+		return -1;
+	}
+
+	if (chroot(".") < 0) {
+		lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
+		return -1;
+	}
+
+	if (chdir("/") < 0) {
+		lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Switch into the new root. Uses chroot_enter() when "/" is detected as a
+ * ramfs (by statfs() magic or by is_on_ramfs()), and pivot_enter() in all
+ * other cases.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int permute_and_enter(void)
+{
+	struct statfs sb;
+
+	if (statfs("/", &sb) < 0) {
+		lxcfs_error("Could not stat / mountpoint: %s.\n", strerror(errno));
+		return -1;
+	}
+
+	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
+	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
+	 * /proc/self/mountinfo via is_on_ramfs(). */
+	if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
+		return chroot_enter();
+
+	if (pivot_enter() < 0) {
+		lxcfs_error("%s\n", "Could not perform pivot root.");
+		return -1;
+	}
+
+	return 0;
+}
+
/* Prepare our new clean root. */
-static int pivot_prepare(void)
+static int permute_prepare(void)
{
if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
lxcfs_error("%s\n", "Failed to create directory for new root.");
return 0;
}
-static bool pivot_new_root(void)
+/* Calls chroot() on ramfs, pivot_root() in all other cases. */
+static bool permute_root(void)
{
/* Prepare new root. */
- if (pivot_prepare() < 0)
+ if (permute_prepare() < 0)
return false;
/* Pivot into new root. */
- if (pivot_enter() < 0)
+ if (permute_and_enter() < 0)
return false;
return true;
}
-static bool setup_cgfs_dir(void)
+static bool cgfs_prepare_mounts(void)
{
if (!mkdir_p(BASEDIR, 0700)) {
lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
return true;
}
-static bool do_mount_cgroups(void)
+static bool cgfs_mount_hierarchies(void)
{
char *target;
size_t clen, len;
for (i = 0; i < num_hierarchies; i++) {
char *controller = hierarchies[i];
+
clen = strlen(controller);
len = strlen(BASEDIR) + clen + 2;
target = malloc(len);
if (!target)
return false;
+
ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
if (ret < 0 || ret >= len) {
free(target);
free(target);
return false;
}
- if (mount(controller, target, "cgroup", 0, controller) < 0) {
- lxcfs_error("Failed mounting cgroup %s\n", controller);
+ if (!strcmp(controller, "unified"))
+ ret = mount("none", target, "cgroup2", 0, NULL);
+ else
+ ret = mount(controller, target, "cgroup", 0, controller);
+ if (ret < 0) {
+ lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
free(target);
return false;
}
static bool cgfs_setup_controllers(void)
{
- if (!setup_cgfs_dir())
+ if (!cgfs_prepare_mounts())
return false;
- if (!do_mount_cgroups()) {
+ if (!cgfs_mount_hierarchies()) {
lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
return false;
}
- if (!pivot_new_root())
+ if (!permute_root())
return false;
return true;
char cwd[MAXPATHLEN];
size_t len = 0;
int i, init_ns = -1;
+ bool found_unified = false;
if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
}
while (getline(&line, &len, f) != -1) {
- char *p, *p2;
+ char *idx, *p, *p2;
p = strchr(line, ':');
if (!p)
goto out;
+ idx = line;
*(p++) = '\0';
p2 = strrchr(p, ':');
* because it parses out the empty string "" and later on passes
* it to mount(). Let's skip such entries.
*/
- if (!strcmp(p, ""))
- continue;
+ if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
+ found_unified = true;
+ p = "unified";
+ }
if (!store_hierarchy(line, p))
goto out;
goto out;
}
- fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
+ fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
if (!fd_hierarchies) {
lxcfs_error("%s\n", strerror(errno));
goto out;