import 15.2.0 Octopus source

[ceph.git] / ceph / src / seastar / src / core / resource.cc
diff --git a/ceph/src/seastar/src/core/resource.cc b/ceph/src/seastar/src/core/resource.cc

index bbc1354a79a3f00b54f460d0fc1992f8043dc63a..e8650973057a290ef1cfa67b5350a9f4f99cfb70 100644 (file)
--- a/ceph/src/seastar/src/core/resource.cc
+++ b/ceph/src/seastar/src/core/resource.cc
@@ -29,27 +29,26 @@
  #include <seastar/util/read_first_line.hh>
  #include <stdlib.h>
  #include <limits>
+#include "cgroup.hh"
+#include <seastar/util/log.hh>
  
  #include <boost/range/adaptor/map.hpp>
  #include <boost/range/algorithm/copy.hpp>
  
  namespace seastar {
  
-// Overload for boost program options parsing/validation
-void validate(boost::any& v,
-              const std::vector<std::string>& values,
-              cpuset_bpo_wrapper* target_type, int) {
-    using namespace boost::program_options;
+extern logger seastar_logger;
+
+// This function was made optional because of validate. It needs to
+// throw an error when a non parseable input is given.
+compat::optional<resource::cpuset> parse_cpuset(std::string value) {
      static std::regex r("(\\d+-)?(\\d+)(,(\\d+-)?(\\d+))*");
-    validators::check_first_occurrence(v);
-    // Extract the first string from 'values'. If there is more than
-    // one string, it's an error, and exception will be thrown.
-    auto&& s = validators::get_single_string(values);
+
      std::smatch match;
-    if (std::regex_match(s, match, r)) {
+    if (std::regex_match(value, match, r)) {
          std::vector<std::string> ranges;
-        boost::split(ranges, s, boost::is_any_of(","));
-        cpuset_bpo_wrapper ret;
+        boost::split(ranges, value, boost::is_any_of(","));
+        resource::cpuset ret;
          for (auto&& range: ranges) {
              std::string beg = range;
              std::string end = range;
@@ -60,19 +59,170 @@ void validate(boost::any& v,
              }
              auto b = boost::lexical_cast<unsigned>(beg);
              auto e = boost::lexical_cast<unsigned>(end);
+
              if (b > e) {
-                throw validation_error(validation_error::invalid_option_value);
+                return seastar::compat::nullopt;
              }
+
              for (auto i = b; i <= e; ++i) {
-                ret.value.insert(i);
+                ret.insert(i);
              }
          }
+        return ret;
+    }
+    return seastar::compat::nullopt;
+}
+
+// Overload for boost program options parsing/validation
+void validate(boost::any& v,
+              const std::vector<std::string>& values,
+              cpuset_bpo_wrapper* target_type, int) {
+    using namespace boost::program_options;
+    validators::check_first_occurrence(v);
+
+    // Extract the first string from 'values'. If there is more than
+    // one string, it's an error, and exception will be thrown.
+    auto&& s = validators::get_single_string(values);
+    auto parsed_cpu_set = parse_cpuset(s);
+
+    if (parsed_cpu_set) {
+        cpuset_bpo_wrapper ret;
+        ret.value = *parsed_cpu_set;
          v = std::move(ret);
      } else {
          throw validation_error(validation_error::invalid_option_value);
      }
  }
  
+namespace cgroup {
+
+namespace fs = seastar::compat::filesystem;
+
+optional<cpuset> cpu_set() {
+    auto cpuset = read_setting_V1V2_as<std::string>(
+                              "cpuset/cpuset.cpus",
+                              "cpuset.cpus.effective");
+    if (cpuset) {
+        return seastar::parse_cpuset(*cpuset);
+    }
+
+    seastar_logger.warn("Unable to parse cgroup's cpuset. Ignoring.");
+    return seastar::compat::nullopt;
+}
+
+size_t memory_limit() {
+    return read_setting_V1V2_as<size_t>(
+                             "memory/memory.limit_in_bytes",
+                             "memory.max")
+        .value_or(std::numeric_limits<size_t>::max());
+}
+
+template <typename T>
+optional<T> read_setting_as(std::string path) {
+    try {
+        auto line = read_first_line(path);
+        return boost::lexical_cast<T>(line);
+    } catch (...) {
+        seastar_logger.warn("Couldn't read cgroup file {}.", path);
+    }
+
+    return seastar::compat::nullopt;
+}
+
+/*
+ * what cgroup do we belong to?
+ *
+ * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>"
+ * Note: true only for V2-only systems, but there is no reason to support
+ * a hybrid configuration.
+ */
+static optional<fs::path> cgroup2_path_my_pid() {
+    seastar::sstring cline;
+    try {
+        cline = read_first_line(fs::path{"/proc/self/cgroup"});
+    } catch (...) {
+        // '/proc/self/cgroup' must be there. If not - there is an issue
+        // with the system configuration.
+        throw std::runtime_error("no cgroup data for our process");
+    }
+
+    // for a V2-only system, we expect exactly one line:
+    // 0::<abs-path-to-cgroup>
+    if (cline.at(0) != '0') {
+        // This is either a v1 system, or system configured with a hybrid of v1 & v2.
+        // We do not support such combinations of v1 and v2 at this point.
+        seastar_logger.debug("Not a cgroups-v2-only system");
+        return seastar::compat::nullopt;
+    }
+
+    // the path is guaranteed to start with '0::/'
+    return fs::path{"/sys/fs/cgroup/" + cline.substr(4)};
+}
+
+/*
+ * traverse the cgroups V2 hierarchy bottom-up, starting from our process'
+ * specific cgroup up to /sys/fs/cgroup, looking for the named file.
+ */
+static optional<fs::path> locate_lowest_cgroup2(fs::path lowest_subdir, std::string filename) {
+    // locate the lowest subgroup containing the named file (i.e.
+    // handles the requested control by itself)
+    do {
+        //  does the cgroup settings file exist?
+        auto set_path = lowest_subdir / filename;
+        if (fs::exists(set_path) ) {
+            return set_path;
+        }
+
+        lowest_subdir = lowest_subdir.parent_path();
+    } while (lowest_subdir.compare("/sys/fs"));
+
+    return seastar::compat::nullopt;
+}
+
+/*
+ * Read a settings value from either the cgroups V2 or the corresponding
+ * cgroups V1 files.
+ * For V2, look for the lowest cgroup in our hierarchy that manages the
+ * requested settings.
+ */
+template <typename T>
+optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {
+    // on v2-systems, cg2_path will be initialized with the leaf cgroup that
+    // controls this process
+    static optional<fs::path> cg2_path{cgroup2_path_my_pid()};
+
+    if (cg2_path) {
+        // this is a v2 system
+        seastar::sstring line;
+        try {
+            line = read_first_line(locate_lowest_cgroup2(*cg2_path, cg2_fname).value());
+        } catch (...) {
+            seastar_logger.warn("Could not read cgroups v2 file ({}).", cg2_fname);
+            return seastar::compat::nullopt;
+        }
+        if (line.compare("max")) {
+            try {
+                return boost::lexical_cast<T>(line);
+            } catch (...) {
+                seastar_logger.warn("Malformed cgroups file ({}) contents.", cg2_fname);
+            }
+        }
+        return seastar::compat::nullopt;
+    }
+
+    // try cgroups v1:
+    try {
+        auto line = read_first_line(fs::path{"/sys/fs/cgroup"} / cg1_path);
+        return boost::lexical_cast<T>(line);
+    } catch (...) {
+        seastar_logger.warn("Could not parse cgroups v1 file ({}).", cg1_path);
+    }
+
+    return seastar::compat::nullopt;
+}
+
+}
+
  namespace resource {
  
  size_t calculate_memory(configuration c, size_t available_memory, float panic_factor = 1) {
@@ -119,19 +269,14 @@ size_t div_roundup(size_t num, size_t denom) {
      return (num + denom - 1) / denom;
  }
  
-static unsigned find_memory_depth(hwloc_topology_t& topology) {
-    auto depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
-    auto obj = hwloc_get_next_obj_by_depth(topology, depth, nullptr);
-
-    while (!obj->memory.local_memory && obj) {
-        obj = hwloc_get_ancestor_obj_by_depth(topology, --depth, obj);
-    }
-    assert(obj);
-    return depth;
-}
-
  static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
-    auto taken = std::min(node->memory.local_memory - used_mem[node], alloc);
+#if HWLOC_API_VERSION >= 0x00020000
+    // FIXME: support nodes with multiple NUMA nodes, whatever that means
+    auto local_memory = node->total_memory;
+#else
+    auto local_memory = node->memory.local_memory;
+#endif
+    auto taken = std::min(local_memory - used_mem[node], alloc);
      if (taken) {
          used_mem[node] += taken;
          auto node_id = hwloc_bitmap_first(node->nodeset);
@@ -141,6 +286,20 @@ static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_ma
      return taken;
  }
  
+// Find the numa node that contains a specific PU.
+static hwloc_obj_t get_numa_node_for_pu(hwloc_topology_t& topology, hwloc_obj_t pu) {
+    // Can't use ancestry because hwloc 2.0 NUMA nodes are not ancestors of PUs
+    hwloc_obj_t tmp = NULL;
+    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
+    while ((tmp = hwloc_get_next_obj_by_depth(topology, depth, tmp)) != NULL) {
+        if (hwloc_bitmap_intersects(tmp->cpuset, pu->cpuset)) {
+            return tmp;
+        }
+    }
+    assert(false && "PU not inside any NUMA node");
+    abort();
+}
+
  struct distribute_objects {
      std::vector<hwloc_cpuset_t> cpu_sets;
      hwloc_obj_t root;
@@ -165,10 +324,9 @@ struct distribute_objects {
  
  static io_queue_topology
  allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned num_io_queues, unsigned& last_node_idx) {
-    unsigned depth = find_memory_depth(topology);
-    auto node_of_shard = [&topology, &cpus, &depth] (unsigned shard) {
+    auto node_of_shard = [&topology, &cpus] (unsigned shard) {
          auto pu = hwloc_get_pu_obj_by_os_index(topology, cpus[shard].cpu_id);
-        auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+        auto node = get_numa_node_for_pu(topology, pu);
          return hwloc_bitmap_first(node->nodeset);
      };
  
@@ -263,16 +421,6 @@ allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned n
  }
  
  
-size_t get_cgroup_memory_limit() {
-    std::experimental::filesystem::path cgroup_memory = "/sys/fs/cgroup/memory/memory.limit_in_bytes";
-
-    try {
-        return boost::lexical_cast<size_t>(read_first_line(cgroup_memory));
-    } catch (...) {
-        return std::numeric_limits<size_t>::max();
-    }
-}
-
  resources allocate(configuration c) {
      hwloc_topology_t topology;
      hwloc_topology_init(&topology);
@@ -285,7 +433,11 @@ resources allocate(configuration c) {
              hwloc_bitmap_set(bm, idx);
          }
          auto r = hwloc_topology_restrict(topology, bm,
+#if HWLOC_API_VERSION >= 0x00020000
+                0
+#else
                  HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES
+#endif
                  | HWLOC_RESTRICT_FLAG_ADAPT_MISC
                  | HWLOC_RESTRICT_FLAG_ADAPT_IO);
          if (r == -1) {
@@ -301,9 +453,13 @@ resources allocate(configuration c) {
      auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
      assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
      auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
+#if HWLOC_API_VERSION >= 0x00020000
+    auto available_memory = machine->total_memory;
+#else
      auto available_memory = machine->memory.total_memory;
+#endif
      size_t mem = calculate_memory(c, std::min(available_memory,
-                                              get_cgroup_memory_limit()));
+                                              cgroup::memory_limit()));
      unsigned available_procs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
      unsigned procs = c.cpus.value_or(available_procs);
      if (procs > available_procs) {
@@ -315,7 +471,6 @@ resources allocate(configuration c) {
      std::unordered_map<hwloc_obj_t, size_t> topo_used_mem;
      std::vector<std::pair<cpu, size_t>> remains;
      size_t remain;
-    unsigned depth = find_memory_depth(topology);
  
      auto cpu_sets = distribute_objects(topology, procs);
  
@@ -324,7 +479,7 @@ resources allocate(configuration c) {
          auto cpu_id = hwloc_bitmap_first(cs);
          assert(cpu_id != -1);
          auto pu = hwloc_get_pu_obj_by_os_index(topology, cpu_id);
-        auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+        auto node = get_numa_node_for_pu(topology, pu);
          cpu this_cpu;
          this_cpu.cpu_id = cpu_id;
          remain = mem_per_proc - alloc_from_node(this_cpu, node, topo_used_mem, mem_per_proc);
@@ -333,12 +488,13 @@ resources allocate(configuration c) {
      }
  
      // Divide the rest of the memory
+    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
      for (auto&& r : remains) {
          cpu this_cpu;
          size_t remain;
          std::tie(this_cpu, remain) = r;
          auto pu = hwloc_get_pu_obj_by_os_index(topology, this_cpu.cpu_id);
-        auto node = hwloc_get_ancestor_obj_by_depth(topology, depth, pu);
+        auto node = get_numa_node_for_pu(topology, pu);
          auto obj = node;
  
          while (remain) {
@@ -412,8 +568,14 @@ resources allocate(configuration c) {
      auto cpuset_procs = c.cpu_set ? c.cpu_set->size() : nr_processing_units();
      auto procs = c.cpus.value_or(cpuset_procs);
      ret.cpus.reserve(procs);
-    for (unsigned i = 0; i < procs; ++i) {
-        ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
+    if (c.cpu_set) {
+        for (auto cpuid : *c.cpu_set) {
+            ret.cpus.push_back(cpu{cpuid, {{mem / procs, 0}}});
+        }
+    } else {
+        for (unsigned i = 0; i < procs; ++i) {
+            ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
+        }
      }
  
      ret.ioq_topology.emplace(0, allocate_io_queues(c, ret.cpus));