#include <seastar/util/read_first_line.hh>
#include <stdlib.h>
#include <limits>
+#include "cgroup.hh"
+#include <seastar/util/log.hh>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/copy.hpp>
namespace seastar {
-// Overload for boost program options parsing/validation
-void validate(boost::any& v,
- const std::vector<std::string>& values,
- cpuset_bpo_wrapper* target_type, int) {
- using namespace boost::program_options;
+extern logger seastar_logger;
+
+// This function was made optional because of validate. It needs to
+// throw an error when a non parseable input is given.
+//
+// Accepts a comma-separated list of cpu ids and inclusive "beg-end"
+// ranges; returns nullopt (instead of throwing) when the input does
+// not match the pattern or a range is inverted (beg > end).
+compat::optional<resource::cpuset> parse_cpuset(std::string value) {
static std::regex r("(\\d+-)?(\\d+)(,(\\d+-)?(\\d+))*");
- validators::check_first_occurrence(v);
- // Extract the first string from 'values'. If there is more than
- // one string, it's an error, and exception will be thrown.
- auto&& s = validators::get_single_string(values);
+
std::smatch match;
- if (std::regex_match(s, match, r)) {
+ if (std::regex_match(value, match, r)) {
std::vector<std::string> ranges;
- boost::split(ranges, s, boost::is_any_of(","));
- cpuset_bpo_wrapper ret;
+ boost::split(ranges, value, boost::is_any_of(","));
+ resource::cpuset ret;
+// NOTE(review): the context lines that split "beg-end" around the '-'
+// are elided from this hunk view; beg/end below come from that code.
for (auto&& range: ranges) {
std::string beg = range;
std::string end = range;
}
auto b = boost::lexical_cast<unsigned>(beg);
auto e = boost::lexical_cast<unsigned>(end);
+
if (b > e) {
- throw validation_error(validation_error::invalid_option_value);
+ return seastar::compat::nullopt;
}
+
+// expand the inclusive range into individual cpu ids
for (auto i = b; i <= e; ++i) {
- ret.value.insert(i);
+ ret.insert(i);
}
}
+ return ret;
+ }
+ return seastar::compat::nullopt;
+}
+
+// Overload for boost program options parsing/validation
+void validate(boost::any& v,
+ const std::vector<std::string>& values,
+ cpuset_bpo_wrapper* target_type, int) {
+ using namespace boost::program_options;
+ validators::check_first_occurrence(v);
+
+ // Extract the first string from 'values'. If there is more than
+ // one string, it's an error, and exception will be thrown.
+ auto&& s = validators::get_single_string(values);
+ auto parsed_cpu_set = parse_cpuset(s);
+
+ if (parsed_cpu_set) {
+ cpuset_bpo_wrapper ret;
+ ret.value = *parsed_cpu_set;
v = std::move(ret);
} else {
throw validation_error(validation_error::invalid_option_value);
}
}
+namespace cgroup {
+
+namespace fs = seastar::compat::filesystem;
+
+optional<cpuset> cpu_set() {
+ auto cpuset = read_setting_V1V2_as<std::string>(
+ "cpuset/cpuset.cpus",
+ "cpuset.cpus.effective");
+ if (cpuset) {
+ return seastar::parse_cpuset(*cpuset);
+ }
+
+ seastar_logger.warn("Unable to parse cgroup's cpuset. Ignoring.");
+ return seastar::compat::nullopt;
+}
+
+size_t memory_limit() {
+ return read_setting_V1V2_as<size_t>(
+ "memory/memory.limit_in_bytes",
+ "memory.max")
+ .value_or(std::numeric_limits<size_t>::max());
+}
+
+template <typename T>
+optional<T> read_setting_as(std::string path) {
+ try {
+ auto line = read_first_line(path);
+ return boost::lexical_cast<T>(line);
+ } catch (...) {
+ seastar_logger.warn("Couldn't read cgroup file {}.", path);
+ }
+
+ return seastar::compat::nullopt;
+}
+
+/*
+ * what cgroup do we belong to?
+ *
+ * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>"
+ * Note: true only for V2-only systems, but there is no reason to support
+ * a hybrid configuration.
+ */
+/*
+ * what cgroup do we belong to?
+ *
+ * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>"
+ * Note: true only for V2-only systems, but there is no reason to support
+ * a hybrid configuration.
+ *
+ * Returns the absolute /sys/fs/cgroup/... directory of our cgroup, or
+ * nullopt on a non-v2-only system. Throws when /proc/self/cgroup is
+ * unreadable.
+ */
+static optional<fs::path> cgroup2_path_my_pid() {
+ seastar::sstring cline;
+ try {
+ cline = read_first_line(fs::path{"/proc/self/cgroup"});
+ } catch (...) {
+ // '/proc/self/cgroup' must be there. If not - there is an issue
+ // with the system configuration.
+ throw std::runtime_error("no cgroup data for our process");
+ }
+
+ // for a V2-only system, we expect exactly one line:
+ // 0::<abs-path-to-cgroup>
+ // NOTE(review): at(0) throws std::out_of_range on an empty line —
+ // presumably /proc/self/cgroup always has content; confirm.
+ if (cline.at(0) != '0') {
+ // This is either a v1 system, or system configured with a hybrid of v1 & v2.
+ // We do not support such combinations of v1 and v2 at this point.
+ seastar_logger.debug("Not a cgroups-v2-only system");
+ return seastar::compat::nullopt;
+ }
+
+ // the path is guaranteed to start with '0::/'
+ // NOTE(review): substr(4) strips the "0::/" prefix, but only the
+ // leading '0' was actually verified above.
+ return fs::path{"/sys/fs/cgroup/" + cline.substr(4)};
+}
+
+/*
+ * traverse the cgroups V2 hierarchy bottom-up, starting from our process'
+ * specific cgroup up to /sys/fs/cgroup, looking for the named file.
+ */
+static optional<fs::path> locate_lowest_cgroup2(fs::path lowest_subdir, std::string filename) {
+ // locate the lowest subgroup containing the named file (i.e.
+ // handles the requested control by itself)
+ do {
+ // does the cgroup settings file exist?
+ auto set_path = lowest_subdir / filename;
+ if (fs::exists(set_path) ) {
+ return set_path;
+ }
+
+ lowest_subdir = lowest_subdir.parent_path();
+ } while (lowest_subdir.compare("/sys/fs"));
+
+ return seastar::compat::nullopt;
+}
+
+/*
+ * Read a settings value from either the cgroups V2 or the corresponding
+ * cgroups V1 files.
+ * For V2, look for the lowest cgroup in our hierarchy that manages the
+ * requested settings.
+ */
+/*
+ * Read a settings value from either the cgroups V2 or the
+ * corresponding cgroups V1 files, converted to T.
+ * For V2, look for the lowest cgroup in our hierarchy that manages the
+ * requested settings. Returns nullopt on any read/parse failure, and
+ * on a v2 value of "max".
+ */
+template <typename T>
+optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {
+ // on v2-systems, cg2_path will be initialized with the leaf cgroup that
+ // controls this process
+ // (computed once per process; later calls reuse the cached value)
+ static optional<fs::path> cg2_path{cgroup2_path_my_pid()};
+
+ if (cg2_path) {
+ // this is a v2 system
+ seastar::sstring line;
+ // .value() throws when no cgroup up the hierarchy exposes the
+ // file; that is folded into the catch below.
+ try {
+ line = read_first_line(locate_lowest_cgroup2(*cg2_path, cg2_fname).value());
+ } catch (...) {
+ seastar_logger.warn("Could not read cgroups v2 file ({}).", cg2_fname);
+ return seastar::compat::nullopt;
+ }
+ // compare() != 0 means the line is NOT "max"; the literal "max"
+ // (cgroup v2's "no limit") falls through to nullopt.
+ if (line.compare("max")) {
+ try {
+ return boost::lexical_cast<T>(line);
+ } catch (...) {
+ seastar_logger.warn("Malformed cgroups file ({}) contents.", cg2_fname);
+ }
+ }
+ return seastar::compat::nullopt;
+ }
+
+ // try cgroups v1:
+ try {
+ auto line = read_first_line(fs::path{"/sys/fs/cgroup"} / cg1_path);
+ return boost::lexical_cast<T>(line);
+ } catch (...) {
+ seastar_logger.warn("Could not parse cgroups v1 file ({}).", cg1_path);
+ }
+
+ return seastar::compat::nullopt;
+}
+
+}
+
namespace resource {
size_t calculate_memory(configuration c, size_t available_memory, float panic_factor = 1) {
return (num + denom - 1) / denom;
}
-static unsigned find_memory_depth(hwloc_topology_t& topology) {
- auto depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
- auto obj = hwloc_get_next_obj_by_depth(topology, depth, nullptr);
-
- while (!obj->memory.local_memory && obj) {
- obj = hwloc_get_ancestor_obj_by_depth(topology, --depth, obj);
- }
- assert(obj);
- return depth;
-}
-
+// Allocate up to 'alloc' bytes of this NUMA node's memory for
+// 'this_cpu', tracking per-node consumption in 'used_mem'; returns the
+// amount actually taken. (NOTE(review): some body lines are elided
+// from this hunk view.)
static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
- auto taken = std::min(node->memory.local_memory - used_mem[node], alloc);
+#if HWLOC_API_VERSION >= 0x00020000
+ // hwloc >= 2.0 no longer exposes memory.local_memory on this object;
+ // use total_memory (NOTE(review): that sums all NUMA nodes below this
+ // object — see the FIXME).
+ // FIXME: support nodes with multiple NUMA nodes, whatever that means
+ auto local_memory = node->total_memory;
+#else
+ auto local_memory = node->memory.local_memory;
+#endif
+ auto taken = std::min(local_memory - used_mem[node], alloc);
if (taken) {
used_mem[node] += taken;
auto node_id = hwloc_bitmap_first(node->nodeset);
return taken;
}
+// Find the numa node that contains a specific PU.
+static hwloc_obj_t get_numa_node_for_pu(hwloc_topology_t& topology, hwloc_obj_t pu) {
+ // Can't use ancestry because hwloc 2.0 NUMA nodes are not ancestors of PUs
+ hwloc_obj_t tmp = NULL;
+ auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
+ while ((tmp = hwloc_get_next_obj_by_depth(topology, depth, tmp)) != NULL) {
+ if (hwloc_bitmap_intersects(tmp->cpuset, pu->cpuset)) {
+ return tmp;
+ }
+ }
+ assert(false && "PU not inside any NUMA node");
+ abort();
+}
+
struct distribute_objects {
std::vector<hwloc_cpuset_t> cpu_sets;
hwloc_obj_t root;
static io_queue_topology
allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned num_io_queues, unsigned& last_node_idx) {
- unsigned depth = find_memory_depth(topology);
- auto node_of_shard = [&topology, &cpus, &depth] (unsigned shard) {
+ // Resolve a shard's NUMA node by cpuset overlap rather than ancestor
+ // depth: with hwloc 2.0, NUMA nodes are not ancestors of PUs.
+ auto node_of_shard = [&topology, &cpus] (unsigned shard) {
auto pu = hwloc_get_pu_obj_by_os_index(topology, cpus[shard].cpu_id);
- auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+ auto node = get_numa_node_for_pu(topology, pu);
return hwloc_bitmap_first(node->nodeset);
};
}
-size_t get_cgroup_memory_limit() {
- std::experimental::filesystem::path cgroup_memory = "/sys/fs/cgroup/memory/memory.limit_in_bytes";
-
- try {
- return boost::lexical_cast<size_t>(read_first_line(cgroup_memory));
- } catch (...) {
- return std::numeric_limits<size_t>::max();
- }
-}
-
resources allocate(configuration c) {
hwloc_topology_t topology;
hwloc_topology_init(&topology);
hwloc_bitmap_set(bm, idx);
}
auto r = hwloc_topology_restrict(topology, bm,
+// NOTE(review): on hwloc >= 2.0 no distance-adaptation flag is passed
+// (presumably HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES was dropped upstream
+// — confirm against hwloc 2.0 upgrade notes).
+#if HWLOC_API_VERSION >= 0x00020000
+ 0
+#else
HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES
+#endif
| HWLOC_RESTRICT_FLAG_ADAPT_MISC
| HWLOC_RESTRICT_FLAG_ADAPT_IO);
if (r == -1) {
auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
+// hwloc >= 2.0 reports machine-wide memory via obj->total_memory
+// instead of obj->memory.total_memory.
+#if HWLOC_API_VERSION >= 0x00020000
+ auto available_memory = machine->total_memory;
+#else
auto available_memory = machine->memory.total_memory;
+#endif
size_t mem = calculate_memory(c, std::min(available_memory,
- get_cgroup_memory_limit()));
+ cgroup::memory_limit()));
unsigned available_procs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
unsigned procs = c.cpus.value_or(available_procs);
if (procs > available_procs) {
std::unordered_map<hwloc_obj_t, size_t> topo_used_mem;
std::vector<std::pair<cpu, size_t>> remains;
size_t remain;
- unsigned depth = find_memory_depth(topology);
auto cpu_sets = distribute_objects(topology, procs);
auto cpu_id = hwloc_bitmap_first(cs);
assert(cpu_id != -1);
auto pu = hwloc_get_pu_obj_by_os_index(topology, cpu_id);
- auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+ auto node = get_numa_node_for_pu(topology, pu);
cpu this_cpu;
this_cpu.cpu_id = cpu_id;
remain = mem_per_proc - alloc_from_node(this_cpu, node, topo_used_mem, mem_per_proc);
}
// Divide the rest of the memory
+// NOTE(review): 'depth' appears unused by the visible lines below (the
+// node lookups now use get_numa_node_for_pu); it may be used by code
+// elided from this hunk — verify before removing.
+ auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
for (auto&& r : remains) {
cpu this_cpu;
size_t remain;
std::tie(this_cpu, remain) = r;
auto pu = hwloc_get_pu_obj_by_os_index(topology, this_cpu.cpu_id);
- auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+ auto node = get_numa_node_for_pu(topology, pu);
auto obj = node;
while (remain) {
auto cpuset_procs = c.cpu_set ? c.cpu_set->size() : nr_processing_units();
auto procs = c.cpus.value_or(cpuset_procs);
ret.cpus.reserve(procs);
- for (unsigned i = 0; i < procs; ++i) {
- ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
+ // Honor an explicit cpu set: shard N gets the N-th cpu id from the
+ // set rather than cpu id N.
+ if (c.cpu_set) {
+ for (auto cpuid : *c.cpu_set) {
+ ret.cpus.push_back(cpu{cpuid, {{mem / procs, 0}}});
+ }
+ } else {
+ for (unsigned i = 0; i < procs; ++i) {
+ ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
+ }
+ }
ret.ioq_topology.emplace(0, allocate_io_queues(c, ret.cpus));