/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp> // boost::lexical_cast is used throughout
#include <regex>
#include <seastar/core/resource.hh>
#include <seastar/core/align.hh>
#include <seastar/core/print.hh>
#include <seastar/util/read_first_line.hh>
#include <stdlib.h>
#include <limits>
#include "cgroup.hh"
#include <seastar/util/log.hh>

#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/copy.hpp>

namespace seastar {

extern logger seastar_logger;

// This function returns an optional because of the validate() overload
// below, which must be the one to throw when input that cannot be parsed
// is given.
compat::optional<resource::cpuset> parse_cpuset(std::string value) {
    static std::regex r("(\\d+-)?(\\d+)(,(\\d+-)?(\\d+))*");

    std::smatch match;
    if (std::regex_match(value, match, r)) {
        std::vector<std::string> ranges;
        boost::split(ranges, value, boost::is_any_of(","));
        resource::cpuset ret;
        for (auto&& range: ranges) {
            std::string beg = range;
            std::string end = range;
            auto dash = range.find('-');
            if (dash != range.npos) {
                beg = range.substr(0, dash);
                end = range.substr(dash + 1);
            }
            auto b = boost::lexical_cast<unsigned>(beg);
            auto e = boost::lexical_cast<unsigned>(end);

            if (b > e) {
                return seastar::compat::nullopt;
            }

            for (auto i = b; i <= e; ++i) {
                ret.insert(i);
            }
        }
        return ret;
    }
    return seastar::compat::nullopt;
}
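
// Usage sketch (illustrative, not part of the build):
//
//     auto cs = seastar::parse_cpuset("0-2,5");   // -> cpuset{0, 1, 2, 5}
//     auto bad = seastar::parse_cpuset("3-1");    // -> nullopt: descending
//                                                 //    ranges are rejected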

// Overload for boost program options parsing/validation
void validate(boost::any& v,
              const std::vector<std::string>& values,
              cpuset_bpo_wrapper* target_type, int) {
    using namespace boost::program_options;
    validators::check_first_occurrence(v);

    // Extract the first string from 'values'. If there is more than
    // one string, it is an error and an exception will be thrown.
    auto&& s = validators::get_single_string(values);
    auto parsed_cpu_set = parse_cpuset(s);

    if (parsed_cpu_set) {
        cpuset_bpo_wrapper ret;
        ret.value = *parsed_cpu_set;
        v = std::move(ret);
    } else {
        throw validation_error(validation_error::invalid_option_value);
    }
}
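
// How this is wired up (a sketch of the usual program_options usage, not
// part of this file):
//
//     namespace bpo = boost::program_options;
//     bpo::options_description desc("options");
//     desc.add_options()
//         ("cpuset", bpo::value<cpuset_bpo_wrapper>(), "CPUs to use");
//     // program_options finds this validate() overload via ADL on
//     // cpuset_bpo_wrapper when it parses the --cpuset argument.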

namespace cgroup {

namespace fs = seastar::compat::filesystem;

optional<cpuset> cpu_set() {
    auto cpuset = read_setting_V1V2_as<std::string>(
                      "cpuset/cpuset.cpus",
                      "cpuset.cpus.effective");
    if (cpuset) {
        return seastar::parse_cpuset(*cpuset);
    }

    seastar_logger.warn("Unable to parse cgroup's cpuset. Ignoring.");
    return seastar::compat::nullopt;
}

size_t memory_limit() {
    return read_setting_V1V2_as<size_t>(
               "memory/memory.limit_in_bytes",
               "memory.max")
        .value_or(std::numeric_limits<size_t>::max());
}
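
// Example (illustrative): on a cgroups-v1 host the limit is read from
// /sys/fs/cgroup/memory/memory.limit_in_bytes; on a v2-only host, from the
// lowest "memory.max" along our cgroup path. A value of "max" (no limit)
// or a failed read falls back to std::numeric_limits<size_t>::max().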

template <typename T>
optional<T> read_setting_as(std::string path) {
    try {
        auto line = read_first_line(path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Couldn't read cgroup file {}.", path);
    }

    return seastar::compat::nullopt;
}

/*
 * Which cgroup do we belong to?
 *
 * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>".
 * Note: this is true only for V2-only systems, but there is no reason to
 * support a hybrid configuration.
 */
static optional<fs::path> cgroup2_path_my_pid() {
    seastar::sstring cline;
    try {
        cline = read_first_line(fs::path{"/proc/self/cgroup"});
    } catch (...) {
        // '/proc/self/cgroup' must be there. If not, there is an issue
        // with the system configuration.
        throw std::runtime_error("no cgroup data for our process");
    }

    // For a V2-only system, we expect exactly one line:
    // 0::<abs-path-to-cgroup>
    if (cline.at(0) != '0') {
        // This is either a v1 system, or a system configured with a hybrid
        // of v1 & v2. We do not support such combinations at this point.
        seastar_logger.debug("Not a cgroups-v2-only system");
        return seastar::compat::nullopt;
    }

    // The line is guaranteed to start with '0::/', so skip that prefix.
    return fs::path{"/sys/fs/cgroup/" + cline.substr(4)};
}
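
// Example (illustrative): if /proc/self/cgroup contains
// "0::/user.slice/session-1.scope", this returns
// fs::path{"/sys/fs/cgroup/user.slice/session-1.scope"}.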

/*
 * Traverse the cgroups V2 hierarchy bottom-up, starting from our process'
 * specific cgroup up to /sys/fs/cgroup, looking for the named file.
 */
static optional<fs::path> locate_lowest_cgroup2(fs::path lowest_subdir, std::string filename) {
    // Locate the lowest subgroup containing the named file (i.e. the one
    // that handles the requested control by itself).
    do {
        // Does the cgroup settings file exist?
        auto set_path = lowest_subdir / filename;
        if (fs::exists(set_path)) {
            return set_path;
        }

        lowest_subdir = lowest_subdir.parent_path();
    } while (lowest_subdir.compare("/sys/fs"));

    return seastar::compat::nullopt;
}
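
// Example (illustrative): starting at /sys/fs/cgroup/a/b with filename
// "memory.max", the loop probes /sys/fs/cgroup/a/b/memory.max, then
// /sys/fs/cgroup/a/memory.max, then /sys/fs/cgroup/memory.max, and gives
// up (nullopt) once the search reaches /sys/fs.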

/*
 * Read a setting from either the cgroups V2 or the corresponding
 * cgroups V1 file.
 * For V2, look for the lowest cgroup in our hierarchy that manages the
 * requested setting.
 */
template <typename T>
optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {
    // On v2 systems, cg2_path will be initialized with the leaf cgroup that
    // controls this process.
    static optional<fs::path> cg2_path{cgroup2_path_my_pid()};

    if (cg2_path) {
        // This is a v2 system.
        seastar::sstring line;
        try {
            line = read_first_line(locate_lowest_cgroup2(*cg2_path, cg2_fname).value());
        } catch (...) {
            seastar_logger.warn("Could not read cgroups v2 file ({}).", cg2_fname);
            return seastar::compat::nullopt;
        }
        // A value of "max" means "no limit", which we report as nullopt.
        if (line.compare("max")) {
            try {
                return boost::lexical_cast<T>(line);
            } catch (...) {
                seastar_logger.warn("Malformed cgroups file ({}) contents.", cg2_fname);
            }
        }
        return seastar::compat::nullopt;
    }

    // Try cgroups v1:
    try {
        auto line = read_first_line(fs::path{"/sys/fs/cgroup"} / cg1_path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Could not parse cgroups v1 file ({}).", cg1_path);
    }

    return seastar::compat::nullopt;
}

} // namespace cgroup

namespace resource {

size_t calculate_memory(configuration c, size_t available_memory, float panic_factor = 1) {
    size_t default_reserve_memory = std::max<size_t>(1536 * 1024 * 1024, 0.07 * available_memory) * panic_factor;
    auto reserve = c.reserve_memory.value_or(default_reserve_memory);
    size_t min_memory = 500'000'000;
    if (available_memory >= reserve + min_memory) {
        available_memory -= reserve;
    } else {
        // Allow starting up even in low memory configurations (e.g. 2GB boot2docker VM)
        available_memory = min_memory;
    }
    size_t mem = c.total_memory.value_or(available_memory);
    if (mem > available_memory) {
        throw std::runtime_error(format("insufficient physical memory: needed {} available {}", mem, available_memory));
    }
    return mem;
}
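
// Worked example (illustrative): with 16384 MiB available and no reserve
// configured, default_reserve_memory = max(1536 MiB, 0.07 * 16384 MiB)
// = 1536 MiB, leaving 16384 - 1536 = 14848 MiB for the process.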

} // namespace resource

} // namespace seastar

#ifdef SEASTAR_HAVE_HWLOC

#include <seastar/util/defer.hh>
#include <seastar/core/print.hh>
#include <hwloc.h>
#include <unordered_map>
#include <boost/range/irange.hpp>

namespace seastar {

cpu_set_t cpuid_to_cpuset(unsigned cpuid) {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(cpuid, &cs);
    return cs;
}

namespace resource {

size_t div_roundup(size_t num, size_t denom) {
    return (num + denom - 1) / denom;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
#if HWLOC_API_VERSION >= 0x00020000
    // FIXME: support nodes with multiple NUMA nodes, whatever that means
    auto local_memory = node->total_memory;
#else
    auto local_memory = node->memory.local_memory;
#endif
    auto taken = std::min(local_memory - used_mem[node], alloc);
    if (taken) {
        used_mem[node] += taken;
        auto node_id = hwloc_bitmap_first(node->nodeset);
        assert(node_id != -1);
        this_cpu.mem.push_back({taken, unsigned(node_id)});
    }
    return taken;
}
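
// Example (illustrative): asked for 1 GiB from a node with only 512 MiB
// still unbooked, alloc_from_node() records a {512 MiB, node} chunk on
// this_cpu and returns 512 MiB; the caller must get the rest elsewhere.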

// Find the NUMA node that contains a specific PU.
static hwloc_obj_t get_numa_node_for_pu(hwloc_topology_t& topology, hwloc_obj_t pu) {
    // Can't use ancestry because hwloc 2.0 NUMA nodes are not ancestors of PUs
    hwloc_obj_t tmp = NULL;
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    while ((tmp = hwloc_get_next_obj_by_depth(topology, depth, tmp)) != NULL) {
        if (hwloc_bitmap_intersects(tmp->cpuset, pu->cpuset)) {
            return tmp;
        }
    }
    assert(false && "PU not inside any NUMA node");
    abort();
}

struct distribute_objects {
    std::vector<hwloc_cpuset_t> cpu_sets;
    hwloc_obj_t root;

    distribute_objects(hwloc_topology_t& topology, size_t nobjs) : cpu_sets(nobjs), root(hwloc_get_root_obj(topology)) {
#if HWLOC_API_VERSION >= 0x00010900
        hwloc_distrib(topology, &root, 1, cpu_sets.data(), cpu_sets.size(), INT_MAX, 0);
#else
        hwloc_distribute(topology, root, cpu_sets.data(), cpu_sets.size(), INT_MAX);
#endif
    }

    ~distribute_objects() {
        for (auto&& cs : cpu_sets) {
            hwloc_bitmap_free(cs);
        }
    }
    std::vector<hwloc_cpuset_t>& operator()() {
        return cpu_sets;
    }
};
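
// Note (assumed behavior of the hwloc API): hwloc_distrib() and the older
// hwloc_distribute() spread cpu_sets.size() cpusets evenly over the
// topology under `root`, so the shards land balanced across packages and
// cores rather than packed onto the first few PUs.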

static io_queue_topology
allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned num_io_queues, unsigned& last_node_idx) {
    auto node_of_shard = [&topology, &cpus] (unsigned shard) {
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpus[shard].cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        return hwloc_bitmap_first(node->nodeset);
    };

    // There are two things we are trying to achieve by populating a numa_nodes map.
    //
    // The first is to find out how many nodes we have in the system. We can't use
    // hwloc for that, because at this point we are no longer talking about the physical system,
    // but the actual booted seastar server instead. So if we have restricted the run to a subset
    // of the available processors, counting topology nodes won't yield the same result.
    //
    // Secondly, we need to find out which processors live in each node. For a reason similar to the
    // above, hwloc won't do us any good here. Later on, we will use this information to assign
    // shards to coordinators that are node-local to themselves.
    std::unordered_map<unsigned, std::set<unsigned>> numa_nodes;
    for (auto shard: boost::irange(0, int(cpus.size()))) {
        auto node_id = node_of_shard(shard);

        if (numa_nodes.count(node_id) == 0) {
            numa_nodes.emplace(node_id, std::set<unsigned>());
        }
        numa_nodes.at(node_id).insert(shard);
    }

    io_queue_topology ret;
    ret.shard_to_coordinator.resize(cpus.size());
    ret.coordinator_to_idx.resize(cpus.size());
    ret.coordinator_to_idx_valid.resize(cpus.size());

    // The user may be playing with the --smp option, but num_io_queues was independently
    // determined by iotune, so adjust for any conflicts.
    if (num_io_queues > cpus.size()) {
        fmt::print("Warning: number of IO queues ({:d}) greater than logical cores ({:d}). Adjusting downwards.\n", num_io_queues, cpus.size());
        num_io_queues = cpus.size();
    }

    auto find_shard = [&cpus] (unsigned cpu_id) {
        auto idx = 0u;
        for (auto& c: cpus) {
            if (c.cpu_id == cpu_id) {
                return idx;
            }
            idx++;
        }
        assert(0);
    };

    auto cpu_sets = distribute_objects(topology, num_io_queues);
    ret.coordinators.reserve(cpu_sets().size());

    // First step: distribute the IO queues given the information returned in cpu_sets.
    // If there is one IO queue per processor, only this loop will be executed.
    std::unordered_map<unsigned, std::vector<unsigned>> node_coordinators;
    for (auto&& cs : cpu_sets()) {
        auto io_coordinator = find_shard(hwloc_bitmap_first(cs));

        ret.coordinator_to_idx[io_coordinator] = ret.coordinators.size();
        assert(!ret.coordinator_to_idx_valid[io_coordinator]);
        ret.coordinator_to_idx_valid[io_coordinator] = true;
        ret.coordinators.emplace_back(io_coordinator);
        // If a processor is a coordinator, it is also its own coordinator.
        ret.shard_to_coordinator[io_coordinator] = io_coordinator;

        auto node_id = node_of_shard(io_coordinator);
        if (node_coordinators.count(node_id) == 0) {
            node_coordinators.emplace(node_id, std::vector<unsigned>());
        }
        node_coordinators.at(node_id).push_back(io_coordinator);
        numa_nodes[node_id].erase(io_coordinator);
    }

    auto available_nodes = boost::copy_range<std::vector<unsigned>>(node_coordinators | boost::adaptors::map_keys);

    // If there are more processors than coordinators, we will have to assign them to existing
    // coordinators. We prefer to do that within the same NUMA node, but if that is not
    // possible we assign the shard to one of the other nodes, round-robin.
    for (auto& node: numa_nodes) {
        auto cid_idx = 0;
        for (auto& remaining_shard: node.second) {
            auto my_node = node.first;
            // No I/O queue in this node; round-robin shards from this node into existing ones.
            if (!node_coordinators.count(node.first)) {
                my_node = available_nodes[last_node_idx++ % available_nodes.size()];
            }
            auto idx = cid_idx++ % node_coordinators.at(my_node).size();
            auto io_coordinator = node_coordinators.at(my_node)[idx];
            ret.shard_to_coordinator[remaining_shard] = io_coordinator;
        }
    }

    return ret;
}
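
// Illustrative outcome (assuming 8 shards across 2 NUMA nodes and
// num_io_queues == 2): each node typically gets one coordinator, and the
// three remaining shards of each node map to their node-local coordinator
// through shard_to_coordinator.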

resources allocate(configuration c) {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    if (c.cpu_set) {
        auto bm = hwloc_bitmap_alloc();
        auto free_bm = defer([&] { hwloc_bitmap_free(bm); });
        for (auto idx : *c.cpu_set) {
            hwloc_bitmap_set(bm, idx);
        }
        auto r = hwloc_topology_restrict(topology, bm,
#if HWLOC_API_VERSION >= 0x00020000
                0
#else
                HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES
#endif
                | HWLOC_RESTRICT_FLAG_ADAPT_MISC
                | HWLOC_RESTRICT_FLAG_ADAPT_IO);
        if (r == -1) {
            if (errno == ENOMEM) {
                throw std::bad_alloc();
            }
            if (errno == EINVAL) {
                throw std::runtime_error("bad cpuset");
            }
            abort();
        }
    }
    auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
    assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
    auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
#if HWLOC_API_VERSION >= 0x00020000
    auto available_memory = machine->total_memory;
#else
    auto available_memory = machine->memory.total_memory;
#endif
    size_t mem = calculate_memory(c, std::min(available_memory,
                                              cgroup::memory_limit()));
    unsigned available_procs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
    unsigned procs = c.cpus.value_or(available_procs);
    if (procs > available_procs) {
        throw std::runtime_error("insufficient processing units");
    }
    // Align the per-cpu allocation down to a 2 MiB (huge page) boundary.
    auto mem_per_proc = align_down<size_t>(mem / procs, 2 << 20);

    resources ret;
    std::unordered_map<hwloc_obj_t, size_t> topo_used_mem;
    std::vector<std::pair<cpu, size_t>> remains;
    size_t remain;

    auto cpu_sets = distribute_objects(topology, procs);

    // Divide local memory among cpus.
    for (auto&& cs : cpu_sets()) {
        auto cpu_id = hwloc_bitmap_first(cs);
        assert(cpu_id != -1);
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        cpu this_cpu;
        this_cpu.cpu_id = cpu_id;
        remain = mem_per_proc - alloc_from_node(this_cpu, node, topo_used_mem, mem_per_proc);

        remains.emplace_back(std::move(this_cpu), remain);
    }

    // Divide the rest of the memory.
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    for (auto&& r : remains) {
        cpu this_cpu;
        size_t remain;
        std::tie(this_cpu, remain) = r;
        auto pu = hwloc_get_pu_obj_by_os_index(topology, this_cpu.cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        auto obj = node;

        while (remain) {
            remain -= alloc_from_node(this_cpu, obj, topo_used_mem, remain);
            do {
                obj = hwloc_get_next_obj_by_depth(topology, depth, obj);
            } while (!obj);
            if (obj == node)
                break;
        }
        assert(!remain);
        ret.cpus.push_back(std::move(this_cpu));
    }

    unsigned last_node_idx = 0;
    for (auto d : c.num_io_queues) {
        auto devid = d.first;
        auto num_io_queues = d.second;
        ret.ioq_topology.emplace(devid, allocate_io_queues(topology, ret.cpus, num_io_queues, last_node_idx));
    }
    return ret;
}
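
// Illustrative outcome: with `--smp 4 --memory 8G` on a single-node box,
// mem_per_proc is 2 GiB (already 2 MiB-aligned), so each of the four cpus
// receives a single {2 GiB, node 0} allocation from the loops above.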

unsigned nr_processing_units() {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    return hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
}

} // namespace resource

} // namespace seastar

#else

#include <seastar/core/resource.hh>
#include <unistd.h>

namespace seastar {

namespace resource {

// Without hwloc, we don't support tuning the number of IO queues, so each
// CPU gets its own queue and acts as its own coordinator.
static io_queue_topology
allocate_io_queues(configuration c, std::vector<cpu> cpus) {
    io_queue_topology ret;

    unsigned nr_cpus = unsigned(cpus.size());
    ret.shard_to_coordinator.resize(nr_cpus);
    ret.coordinators.resize(nr_cpus);
    ret.coordinator_to_idx.resize(nr_cpus);
    ret.coordinator_to_idx_valid.resize(nr_cpus);

    for (unsigned shard = 0; shard < nr_cpus; ++shard) {
        ret.shard_to_coordinator[shard] = shard;
        ret.coordinators[shard] = shard;
        ret.coordinator_to_idx[shard] = shard;
        ret.coordinator_to_idx_valid[shard] = true;
    }
    return ret;
}

resources allocate(configuration c) {
    resources ret;

    auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES));
    auto mem = calculate_memory(c, available_memory);
    auto cpuset_procs = c.cpu_set ? c.cpu_set->size() : nr_processing_units();
    auto procs = c.cpus.value_or(cpuset_procs);
    ret.cpus.reserve(procs);
    if (c.cpu_set) {
        for (auto cpuid : *c.cpu_set) {
            ret.cpus.push_back(cpu{cpuid, {{mem / procs, 0}}});
        }
    } else {
        for (unsigned i = 0; i < procs; ++i) {
            ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
        }
    }

    ret.ioq_topology.emplace(0, allocate_io_queues(c, ret.cpus));
    return ret;
}

unsigned nr_processing_units() {
    return ::sysconf(_SC_NPROCESSORS_ONLN);
}

} // namespace resource

} // namespace seastar

#endif