/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp> // for boost::lexical_cast, used by parse_cpuset() and the cgroup readers below
#include <regex>
#include <seastar/core/resource.hh>
#include <seastar/core/align.hh>
#include <seastar/core/print.hh>
#include <seastar/util/read_first_line.hh>
#include <stdlib.h>
#include <limits>
#include "cgroup.hh"
#include <seastar/util/log.hh>

#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/copy.hpp>

namespace seastar {

extern logger seastar_logger;

// This function returns an optional rather than throwing because the validate()
// overload below is the one that needs to throw when non-parseable input is given.
compat::optional<resource::cpuset> parse_cpuset(std::string value) {
    static std::regex r("(\\d+-)?(\\d+)(,(\\d+-)?(\\d+))*");

    std::smatch match;
    if (std::regex_match(value, match, r)) {
        std::vector<std::string> ranges;
        boost::split(ranges, value, boost::is_any_of(","));
        resource::cpuset ret;
        for (auto&& range: ranges) {
            std::string beg = range;
            std::string end = range;
            auto dash = range.find('-');
            if (dash != range.npos) {
                beg = range.substr(0, dash);
                end = range.substr(dash + 1);
            }
            auto b = boost::lexical_cast<unsigned>(beg);
            auto e = boost::lexical_cast<unsigned>(end);

            if (b > e) {
                return seastar::compat::nullopt;
            }

            for (auto i = b; i <= e; ++i) {
                ret.insert(i);
            }
        }
        return ret;
    }
    return seastar::compat::nullopt;
}
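
// Illustrative only (not part of the upstream file): the expected mapping of a few
// inputs through parse_cpuset() above, following the regex and range expansion shown:
//
//   parse_cpuset("0-3,7")  -> cpuset {0, 1, 2, 3, 7}
//   parse_cpuset("5")      -> cpuset {5}
//   parse_cpuset("3-1")    -> compat::nullopt   (reversed range)
//   parse_cpuset("a,b")    -> compat::nullopt   (regex does not match)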

// Overload for boost program options parsing/validation
void validate(boost::any& v,
              const std::vector<std::string>& values,
              cpuset_bpo_wrapper* target_type, int) {
    using namespace boost::program_options;
    validators::check_first_occurrence(v);

    // Extract the first string from 'values'. If there is more than
    // one string, it's an error, and an exception will be thrown.
    auto&& s = validators::get_single_string(values);
    auto parsed_cpu_set = parse_cpuset(s);

    if (parsed_cpu_set) {
        cpuset_bpo_wrapper ret;
        ret.value = *parsed_cpu_set;
        v = std::move(ret);
    } else {
        throw validation_error(validation_error::invalid_option_value);
    }
}

namespace cgroup {

namespace fs = seastar::compat::filesystem;

optional<cpuset> cpu_set() {
    auto cpuset = read_setting_V1V2_as<std::string>(
                              "cpuset/cpuset.cpus",
                              "cpuset.cpus.effective");
    if (cpuset) {
        return seastar::parse_cpuset(*cpuset);
    }

    seastar_logger.warn("Unable to parse cgroup's cpuset. Ignoring.");
    return seastar::compat::nullopt;
}

size_t memory_limit() {
    return read_setting_V1V2_as<size_t>(
                       "memory/memory.limit_in_bytes",
                       "memory.max")
        .value_or(std::numeric_limits<size_t>::max());
}
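
// Illustrative only: how memory_limit() above behaves under two hypothetical setups,
// assuming the cgroup files are readable:
//
//   cgroups v1, memory/memory.limit_in_bytes = "4294967296"  -> 4 GiB limit
//   cgroups v2, memory.max = "max"                           -> no limit, i.e.
//                                                               std::numeric_limits<size_t>::max()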

template <typename T>
optional<T> read_setting_as(std::string path) {
    try {
        auto line = read_first_line(path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Couldn't read cgroup file {}.", path);
    }

    return seastar::compat::nullopt;
}

/*
 * what cgroup do we belong to?
 *
 * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>"
 * Note: true only for V2-only systems, but there is no reason to support
 * a hybrid configuration.
 */
static optional<fs::path> cgroup2_path_my_pid() {
    seastar::sstring cline;
    try {
        cline = read_first_line(fs::path{"/proc/self/cgroup"});
    } catch (...) {
        // '/proc/self/cgroup' must be there. If not - there is an issue
        // with the system configuration.
        throw std::runtime_error("no cgroup data for our process");
    }

    // for a V2-only system, we expect exactly one line:
    // 0::<abs-path-to-cgroup>
    if (cline.at(0) != '0') {
        // This is either a v1 system, or a system configured with a hybrid of v1 & v2.
        // We do not support such combinations of v1 and v2 at this point.
        seastar_logger.debug("Not a cgroups-v2-only system");
        return seastar::compat::nullopt;
    }

    // the path is guaranteed to start with '0::/'
    return fs::path{"/sys/fs/cgroup/" + cline.substr(4)};
}
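
// Illustrative only: on a hypothetical v2-only host, /proc/self/cgroup might contain
//
//   0::/user.slice/user-1000.slice/session-1.scope
//
// and cgroup2_path_my_pid() above would then return
// "/sys/fs/cgroup/user.slice/user-1000.slice/session-1.scope" (cline.substr(4) drops
// the leading "0::/"). On a v1 or hybrid host the first line is a v1 entry with a
// non-zero hierarchy id, so the function returns nullopt.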

/*
 * traverse the cgroups V2 hierarchy bottom-up, starting from our process'
 * specific cgroup up to /sys/fs/cgroup, looking for the named file.
 */
static optional<fs::path> locate_lowest_cgroup2(fs::path lowest_subdir, std::string filename) {
    // locate the lowest subgroup containing the named file (i.e.
    // handles the requested control by itself)
    do {
        // does the cgroup settings file exist?
        auto set_path = lowest_subdir / filename;
        if (fs::exists(set_path)) {
            return set_path;
        }

        lowest_subdir = lowest_subdir.parent_path();
    } while (lowest_subdir.compare("/sys/fs"));

    return seastar::compat::nullopt;
}
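
// Illustrative only: with lowest_subdir = "/sys/fs/cgroup/a/b" (a made-up hierarchy)
// and filename = "memory.max", locate_lowest_cgroup2() above probes, in order,
//
//   /sys/fs/cgroup/a/b/memory.max
//   /sys/fs/cgroup/a/memory.max
//   /sys/fs/cgroup/memory.max
//
// and returns the first path that exists; once parent_path() reaches "/sys/fs" the
// loop stops and nullopt is returned.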

/*
 * Read a settings value from either the cgroups V2 or the corresponding
 * cgroups V1 files.
 * For V2, look for the lowest cgroup in our hierarchy that manages the
 * requested settings.
 */
template <typename T>
optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {
    // on v2-systems, cg2_path will be initialized with the leaf cgroup that
    // controls this process
    static optional<fs::path> cg2_path{cgroup2_path_my_pid()};

    if (cg2_path) {
        // this is a v2 system
        seastar::sstring line;
        try {
            line = read_first_line(locate_lowest_cgroup2(*cg2_path, cg2_fname).value());
        } catch (...) {
            seastar_logger.warn("Could not read cgroups v2 file ({}).", cg2_fname);
            return seastar::compat::nullopt;
        }
        if (line.compare("max")) {
            try {
                return boost::lexical_cast<T>(line);
            } catch (...) {
                seastar_logger.warn("Malformed cgroups file ({}) contents.", cg2_fname);
            }
        }
        return seastar::compat::nullopt;
    }

    // try cgroups v1:
    try {
        auto line = read_first_line(fs::path{"/sys/fs/cgroup"} / cg1_path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Could not parse cgroups v1 file ({}).", cg1_path);
    }

    return seastar::compat::nullopt;
}

}

namespace resource {

size_t calculate_memory(configuration c, size_t available_memory, float panic_factor = 1) {
    size_t default_reserve_memory = std::max<size_t>(1536 * 1024 * 1024, 0.07 * available_memory) * panic_factor;
    auto reserve = c.reserve_memory.value_or(default_reserve_memory);
    size_t min_memory = 500'000'000;
    if (available_memory >= reserve + min_memory) {
        available_memory -= reserve;
    } else {
        // Allow starting up even in low memory configurations (e.g. 2GB boot2docker VM)
        available_memory = min_memory;
    }
    size_t mem = c.total_memory.value_or(available_memory);
    if (mem > available_memory) {
        throw std::runtime_error(format("insufficient physical memory: needed {} available {}", mem, available_memory));
    }
    return mem;
}
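
// Illustrative only: a worked example of calculate_memory() above with
// c.reserve_memory and c.total_memory unset and panic_factor = 1, on a machine
// reporting 16 GiB (17179869184 bytes) of available memory:
//
//   default_reserve_memory = max(1536 MiB, 0.07 * 16 GiB) = 1536 MiB
//   available_memory       = 16 GiB - 1536 MiB            ~ 14.5 GiB
//
// and that remainder is returned. If the machine had less than
// reserve + 500'000'000 bytes, the function would fall back to the 500 MB minimum.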

}

}

#ifdef SEASTAR_HAVE_HWLOC

#include <seastar/util/defer.hh>
#include <seastar/core/print.hh>
#include <hwloc.h>
#include <unordered_map>
#include <boost/range/irange.hpp>

namespace seastar {

cpu_set_t cpuid_to_cpuset(unsigned cpuid) {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(cpuid, &cs);
    return cs;
}

namespace resource {

size_t div_roundup(size_t num, size_t denom) {
    return (num + denom - 1) / denom;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
#if HWLOC_API_VERSION >= 0x00020000
    // FIXME: support nodes with multiple NUMA nodes, whatever that means
    auto local_memory = node->total_memory;
#else
    auto local_memory = node->memory.local_memory;
#endif
    auto taken = std::min(local_memory - used_mem[node], alloc);
    if (taken) {
        used_mem[node] += taken;
        auto node_id = hwloc_bitmap_first(node->nodeset);
        assert(node_id != -1);
        this_cpu.mem.push_back({taken, unsigned(node_id)});
    }
    return taken;
}

// Find the numa node that contains a specific PU.
static hwloc_obj_t get_numa_node_for_pu(hwloc_topology_t& topology, hwloc_obj_t pu) {
    // Can't use ancestry because hwloc 2.0 NUMA nodes are not ancestors of PUs
    hwloc_obj_t tmp = NULL;
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    while ((tmp = hwloc_get_next_obj_by_depth(topology, depth, tmp)) != NULL) {
        if (hwloc_bitmap_intersects(tmp->cpuset, pu->cpuset)) {
            return tmp;
        }
    }
    assert(false && "PU not inside any NUMA node");
    abort();
}

struct distribute_objects {
    std::vector<hwloc_cpuset_t> cpu_sets;
    hwloc_obj_t root;

    distribute_objects(hwloc_topology_t& topology, size_t nobjs) : cpu_sets(nobjs), root(hwloc_get_root_obj(topology)) {
#if HWLOC_API_VERSION >= 0x00010900
        hwloc_distrib(topology, &root, 1, cpu_sets.data(), cpu_sets.size(), INT_MAX, 0);
#else
        hwloc_distribute(topology, root, cpu_sets.data(), cpu_sets.size(), INT_MAX);
#endif
    }

    ~distribute_objects() {
        for (auto&& cs : cpu_sets) {
            hwloc_bitmap_free(cs);
        }
    }
    std::vector<hwloc_cpuset_t>& operator()() {
        return cpu_sets;
    }
};
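
// Illustrative only: distribute_objects(topology, N) asks hwloc to spread N cpusets
// as evenly as possible over the (possibly restricted) topology. For example, on a
// hypothetical machine with two packages of two cores each, distribute_objects(topology, 4)
// would be expected to yield one cpuset per core rather than four cpusets packed into
// a single package.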

static io_queue_topology
allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned num_io_queues, unsigned& last_node_idx) {
    auto node_of_shard = [&topology, &cpus] (unsigned shard) {
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpus[shard].cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        return hwloc_bitmap_first(node->nodeset);
    };

    // There are two things we are trying to achieve by populating a numa_nodes map.
    //
    // The first is to find out how many nodes we have in the system. We can't use
    // hwloc for that, because at this point we are no longer talking about the physical system,
    // but the actual booted seastar server instead. So if we have restricted the run to a subset
    // of the available processors, counting topology nodes won't yield the same result.
    //
    // Secondly, we need to find out which processors live in each node. For a reason similar to the
    // above, hwloc won't do us any good here. Later on, we will use this information to assign
    // shards to coordinators that are node-local to themselves.
    std::unordered_map<unsigned, std::set<unsigned>> numa_nodes;
    for (auto shard: boost::irange(0, int(cpus.size()))) {
        auto node_id = node_of_shard(shard);

        if (numa_nodes.count(node_id) == 0) {
            numa_nodes.emplace(node_id, std::set<unsigned>());
        }
        numa_nodes.at(node_id).insert(shard);
    }
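
    // Illustrative only: if the run is restricted to PUs 0-3 on a two-node machine
    // where PUs 0-1 sit on NUMA node 0 and PUs 2-3 on NUMA node 1, this map ends up
    // as numa_nodes = { 0 -> {shards 0, 1}, 1 -> {shards 2, 3} } (shard indexes, not
    // OS cpu ids, are the stored values).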

    io_queue_topology ret;
    ret.shard_to_coordinator.resize(cpus.size());
    ret.coordinator_to_idx.resize(cpus.size());
    ret.coordinator_to_idx_valid.resize(cpus.size());

    // The user may be playing with the --smp option, but num_io_queues was independently
    // determined by iotune, so adjust for any conflicts.
    if (num_io_queues > cpus.size()) {
        fmt::print("Warning: number of IO queues ({:d}) greater than logical cores ({:d}). Adjusting downwards.\n", num_io_queues, cpus.size());
        num_io_queues = cpus.size();
    }

    auto find_shard = [&cpus] (unsigned cpu_id) {
        auto idx = 0u;
        for (auto& c: cpus) {
            if (c.cpu_id == cpu_id) {
                return idx;
            }
            idx++;
        }
        assert(0);
    };

    auto cpu_sets = distribute_objects(topology, num_io_queues);
    ret.coordinators.reserve(cpu_sets().size());

    // First step: distribute the IO queues given the information returned in cpu_sets.
    // If there is one IO queue per processor, only this loop will be executed.
    std::unordered_map<unsigned, std::vector<unsigned>> node_coordinators;
    for (auto&& cs : cpu_sets()) {
        auto io_coordinator = find_shard(hwloc_bitmap_first(cs));

        ret.coordinator_to_idx[io_coordinator] = ret.coordinators.size();
        assert(!ret.coordinator_to_idx_valid[io_coordinator]);
        ret.coordinator_to_idx_valid[io_coordinator] = true;
        ret.coordinators.emplace_back(io_coordinator);
        // If a processor is a coordinator, it is obviously also its own coordinator
        ret.shard_to_coordinator[io_coordinator] = io_coordinator;

        auto node_id = node_of_shard(io_coordinator);
        if (node_coordinators.count(node_id) == 0) {
            node_coordinators.emplace(node_id, std::vector<unsigned>());
        }
        node_coordinators.at(node_id).push_back(io_coordinator);
        numa_nodes[node_id].erase(io_coordinator);
    }

    auto available_nodes = boost::copy_range<std::vector<unsigned>>(node_coordinators | boost::adaptors::map_keys);

    // If there are more processors than coordinators, we will have to assign them to existing
    // coordinators. We prefer to do that within the same NUMA node, but if that is not possible
    // we assign the shard to a random node.
    for (auto& node: numa_nodes) {
        auto cid_idx = 0;
        for (auto& remaining_shard: node.second) {
            auto my_node = node.first;
            // No I/O queue in this node, round-robin shards from this node into existing ones.
            if (!node_coordinators.count(node.first)) {
                my_node = available_nodes[last_node_idx++ % available_nodes.size()];
            }
            auto idx = cid_idx++ % node_coordinators.at(my_node).size();
            auto io_coordinator = node_coordinators.at(my_node)[idx];
            ret.shard_to_coordinator[remaining_shard] = io_coordinator;
        }
    }
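
    // Illustrative only: continuing the two-node example above with num_io_queues = 2
    // and both coordinators landing on node 0, node 1 has shards but no coordinator of
    // its own, so its shards are round-robined (via last_node_idx) onto coordinators
    // picked from available_nodes, i.e. onto node 0's coordinators in this case.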

    return ret;
}

resources allocate(configuration c) {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    if (c.cpu_set) {
        auto bm = hwloc_bitmap_alloc();
        auto free_bm = defer([&] { hwloc_bitmap_free(bm); });
        for (auto idx : *c.cpu_set) {
            hwloc_bitmap_set(bm, idx);
        }
        auto r = hwloc_topology_restrict(topology, bm,
#if HWLOC_API_VERSION >= 0x00020000
                0
#else
                HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES
#endif
                | HWLOC_RESTRICT_FLAG_ADAPT_MISC
                | HWLOC_RESTRICT_FLAG_ADAPT_IO);
        if (r == -1) {
            if (errno == ENOMEM) {
                throw std::bad_alloc();
            }
            if (errno == EINVAL) {
                throw std::runtime_error("bad cpuset");
            }
            abort();
        }
    }
    auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
    assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
    auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
#if HWLOC_API_VERSION >= 0x00020000
    auto available_memory = machine->total_memory;
#else
    auto available_memory = machine->memory.total_memory;
#endif
    size_t mem = calculate_memory(c, std::min(available_memory,
                                              cgroup::memory_limit()));
    unsigned available_procs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
    unsigned procs = c.cpus.value_or(available_procs);
    if (procs > available_procs) {
        throw std::runtime_error("insufficient processing units");
    }
    auto mem_per_proc = align_down<size_t>(mem / procs, 2 << 20);

    resources ret;
    std::unordered_map<hwloc_obj_t, size_t> topo_used_mem;
    std::vector<std::pair<cpu, size_t>> remains;
    size_t remain;

    auto cpu_sets = distribute_objects(topology, procs);

    // Divide local memory among cpus
    for (auto&& cs : cpu_sets()) {
        auto cpu_id = hwloc_bitmap_first(cs);
        assert(cpu_id != -1);
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        cpu this_cpu;
        this_cpu.cpu_id = cpu_id;
        remain = mem_per_proc - alloc_from_node(this_cpu, node, topo_used_mem, mem_per_proc);

        remains.emplace_back(std::move(this_cpu), remain);
    }

    // Divide the rest of the memory
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    for (auto&& r : remains) {
        cpu this_cpu;
        size_t remain;
        std::tie(this_cpu, remain) = r;
        auto pu = hwloc_get_pu_obj_by_os_index(topology, this_cpu.cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        auto obj = node;

        while (remain) {
            remain -= alloc_from_node(this_cpu, obj, topo_used_mem, remain);
            do {
                obj = hwloc_get_next_obj_by_depth(topology, depth, obj);
            } while (!obj);
            if (obj == node)
                break;
        }
        assert(!remain);
        ret.cpus.push_back(std::move(this_cpu));
    }
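
    // Illustrative only: suppose two NUMA nodes with 8 GiB of local memory each and
    // all shards pinned to node 0 (a made-up layout). The first loop above hands out
    // node-0 memory until it is exhausted; any per-shard remainder is then satisfied
    // by the second loop, which walks the NUMA-node depth (wrapping around) and takes
    // the rest from node 1, giving up only when it arrives back at the home node.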

    unsigned last_node_idx = 0;
    for (auto d : c.num_io_queues) {
        auto devid = d.first;
        auto num_io_queues = d.second;
        ret.ioq_topology.emplace(devid, allocate_io_queues(topology, ret.cpus, num_io_queues, last_node_idx));
    }
    return ret;
}

unsigned nr_processing_units() {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    return hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
}

}

}

#else

#include <seastar/core/resource.hh>
#include <unistd.h>

namespace seastar {

namespace resource {

// Without hwloc, we don't support tuning the number of IO queues. So each CPU gets its own.
static io_queue_topology
allocate_io_queues(configuration c, std::vector<cpu> cpus) {
    io_queue_topology ret;

    unsigned nr_cpus = unsigned(cpus.size());
    ret.shard_to_coordinator.resize(nr_cpus);
    ret.coordinators.resize(nr_cpus);
    ret.coordinator_to_idx.resize(nr_cpus);
    ret.coordinator_to_idx_valid.resize(nr_cpus);

    for (unsigned shard = 0; shard < nr_cpus; ++shard) {
        ret.shard_to_coordinator[shard] = shard;
        ret.coordinators[shard] = shard;
        ret.coordinator_to_idx[shard] = shard;
        ret.coordinator_to_idx_valid[shard] = true;
    }
    return ret;
}

resources allocate(configuration c) {
    resources ret;

    auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES));
    auto mem = calculate_memory(c, available_memory);
    auto cpuset_procs = c.cpu_set ? c.cpu_set->size() : nr_processing_units();
    auto procs = c.cpus.value_or(cpuset_procs);
    ret.cpus.reserve(procs);
    if (c.cpu_set) {
        for (auto cpuid : *c.cpu_set) {
            ret.cpus.push_back(cpu{cpuid, {{mem / procs, 0}}});
        }
    } else {
        for (unsigned i = 0; i < procs; ++i) {
            ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
        }
    }

    ret.ioq_topology.emplace(0, allocate_io_queues(c, ret.cpus));
    return ret;
}

unsigned nr_processing_units() {
    return ::sysconf(_SC_NPROCESSORS_ONLN);
}

}

}

#endif