/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp> // for boost::lexical_cast, used by parse_cpuset() and the cgroup readers below
#include <regex>
#include <seastar/core/resource.hh>
#include <seastar/core/align.hh>
#include <seastar/core/print.hh>
#include <seastar/util/read_first_line.hh>
#include <stdlib.h>
#include <limits>
#include "cgroup.hh"
#include <seastar/util/log.hh>

#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/copy.hpp>

namespace seastar {

extern logger seastar_logger;

// This function returns an optional rather than throwing because the validate()
// overload below is the one that needs to throw when non-parseable input is given.
compat::optional<resource::cpuset> parse_cpuset(std::string value) {
    static std::regex r("(\\d+-)?(\\d+)(,(\\d+-)?(\\d+))*");

    std::smatch match;
    if (std::regex_match(value, match, r)) {
        std::vector<std::string> ranges;
        boost::split(ranges, value, boost::is_any_of(","));
        resource::cpuset ret;
        for (auto&& range: ranges) {
            std::string beg = range;
            std::string end = range;
            auto dash = range.find('-');
            if (dash != range.npos) {
                beg = range.substr(0, dash);
                end = range.substr(dash + 1);
            }
            auto b = boost::lexical_cast<unsigned>(beg);
            auto e = boost::lexical_cast<unsigned>(end);

            if (b > e) {
                return seastar::compat::nullopt;
            }

            for (auto i = b; i <= e; ++i) {
                ret.insert(i);
            }
        }
        return ret;
    }
    return seastar::compat::nullopt;
}
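
// Illustrative only (not part of the upstream file): the expected mapping of a few
// inputs through parse_cpuset() above, following the regex and range expansion shown:
//
//   parse_cpuset("0-3,7")  -> cpuset {0, 1, 2, 3, 7}
//   parse_cpuset("5")      -> cpuset {5}
//   parse_cpuset("3-1")    -> compat::nullopt   (reversed range)
//   parse_cpuset("a,b")    -> compat::nullopt   (regex does not match)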

// Overload for boost program options parsing/validation
void validate(boost::any& v,
              const std::vector<std::string>& values,
              cpuset_bpo_wrapper* target_type, int) {
    using namespace boost::program_options;
    validators::check_first_occurrence(v);

    // Extract the first string from 'values'. If there is more than
    // one string, it's an error, and an exception will be thrown.
    auto&& s = validators::get_single_string(values);
    auto parsed_cpu_set = parse_cpuset(s);

    if (parsed_cpu_set) {
        cpuset_bpo_wrapper ret;
        ret.value = *parsed_cpu_set;
        v = std::move(ret);
    } else {
        throw validation_error(validation_error::invalid_option_value);
    }
}

namespace cgroup {

namespace fs = seastar::compat::filesystem;

optional<cpuset> cpu_set() {
    auto cpuset = read_setting_V1V2_as<std::string>(
                              "cpuset/cpuset.cpus",
                              "cpuset.cpus.effective");
    if (cpuset) {
        return seastar::parse_cpuset(*cpuset);
    }

    seastar_logger.warn("Unable to parse cgroup's cpuset. Ignoring.");
    return seastar::compat::nullopt;
}

size_t memory_limit() {
    return read_setting_V1V2_as<size_t>(
                       "memory/memory.limit_in_bytes",
                       "memory.max")
        .value_or(std::numeric_limits<size_t>::max());
}
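
// Illustrative only: how memory_limit() above behaves under two hypothetical setups,
// assuming the cgroup files are readable:
//
//   cgroups v1, memory/memory.limit_in_bytes = "4294967296"  -> 4 GiB limit
//   cgroups v2, memory.max = "max"                           -> no limit, i.e.
//                                                               std::numeric_limits<size_t>::max()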

template <typename T>
optional<T> read_setting_as(std::string path) {
    try {
        auto line = read_first_line(path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Couldn't read cgroup file {}.", path);
    }

    return seastar::compat::nullopt;
}

/*
 * what cgroup do we belong to?
 *
 * For cgroups V2, /proc/self/cgroup should read "0::<cgroup-dir-path>"
 * Note: true only for V2-only systems, but there is no reason to support
 * a hybrid configuration.
 */
static optional<fs::path> cgroup2_path_my_pid() {
    seastar::sstring cline;
    try {
        cline = read_first_line(fs::path{"/proc/self/cgroup"});
    } catch (...) {
        // '/proc/self/cgroup' must be there. If not - there is an issue
        // with the system configuration.
        throw std::runtime_error("no cgroup data for our process");
    }

    // for a V2-only system, we expect exactly one line:
    // 0::<abs-path-to-cgroup>
    if (cline.at(0) != '0') {
        // This is either a v1 system, or a system configured with a hybrid of v1 & v2.
        // We do not support such combinations of v1 and v2 at this point.
        seastar_logger.debug("Not a cgroups-v2-only system");
        return seastar::compat::nullopt;
    }

    // the path is guaranteed to start with '0::/'
    return fs::path{"/sys/fs/cgroup/" + cline.substr(4)};
}
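
// Illustrative only: on a hypothetical v2-only host, /proc/self/cgroup might contain
//
//   0::/user.slice/user-1000.slice/session-1.scope
//
// and cgroup2_path_my_pid() above would then return
// "/sys/fs/cgroup/user.slice/user-1000.slice/session-1.scope" (cline.substr(4) drops
// the leading "0::/"). On a v1 or hybrid host the first line is a v1 entry with a
// non-zero hierarchy id, so the function returns nullopt.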

/*
 * traverse the cgroups V2 hierarchy bottom-up, starting from our process'
 * specific cgroup up to /sys/fs/cgroup, looking for the named file.
 */
static optional<fs::path> locate_lowest_cgroup2(fs::path lowest_subdir, std::string filename) {
    // locate the lowest subgroup containing the named file (i.e.
    // handles the requested control by itself)
    do {
        // does the cgroup settings file exist?
        auto set_path = lowest_subdir / filename;
        if (fs::exists(set_path)) {
            return set_path;
        }

        lowest_subdir = lowest_subdir.parent_path();
    } while (lowest_subdir.compare("/sys/fs"));

    return seastar::compat::nullopt;
}
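
// Illustrative only: with lowest_subdir = "/sys/fs/cgroup/a/b" (a made-up hierarchy)
// and filename = "memory.max", locate_lowest_cgroup2() above probes, in order,
//
//   /sys/fs/cgroup/a/b/memory.max
//   /sys/fs/cgroup/a/memory.max
//   /sys/fs/cgroup/memory.max
//
// and returns the first path that exists; once parent_path() reaches "/sys/fs" the
// loop stops and nullopt is returned.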

/*
 * Read a settings value from either the cgroups V2 or the corresponding
 * cgroups V1 files.
 * For V2, look for the lowest cgroup in our hierarchy that manages the
 * requested settings.
 */
template <typename T>
optional<T> read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) {
    // on v2-systems, cg2_path will be initialized with the leaf cgroup that
    // controls this process
    static optional<fs::path> cg2_path{cgroup2_path_my_pid()};

    if (cg2_path) {
        // this is a v2 system
        seastar::sstring line;
        try {
            line = read_first_line(locate_lowest_cgroup2(*cg2_path, cg2_fname).value());
        } catch (...) {
            seastar_logger.warn("Could not read cgroups v2 file ({}).", cg2_fname);
            return seastar::compat::nullopt;
        }
        if (line.compare("max")) {
            try {
                return boost::lexical_cast<T>(line);
            } catch (...) {
                seastar_logger.warn("Malformed cgroups file ({}) contents.", cg2_fname);
            }
        }
        return seastar::compat::nullopt;
    }

    // try cgroups v1:
    try {
        auto line = read_first_line(fs::path{"/sys/fs/cgroup"} / cg1_path);
        return boost::lexical_cast<T>(line);
    } catch (...) {
        seastar_logger.warn("Could not parse cgroups v1 file ({}).", cg1_path);
    }

    return seastar::compat::nullopt;
}

}

namespace resource {

size_t calculate_memory(configuration c, size_t available_memory, float panic_factor = 1) {
    size_t default_reserve_memory = std::max<size_t>(1536 * 1024 * 1024, 0.07 * available_memory) * panic_factor;
    auto reserve = c.reserve_memory.value_or(default_reserve_memory);
    size_t min_memory = 500'000'000;
    if (available_memory >= reserve + min_memory) {
        available_memory -= reserve;
    } else {
        // Allow starting up even in low memory configurations (e.g. 2GB boot2docker VM)
        available_memory = min_memory;
    }
    size_t mem = c.total_memory.value_or(available_memory);
    if (mem > available_memory) {
        throw std::runtime_error(format("insufficient physical memory: needed {} available {}", mem, available_memory));
    }
    return mem;
}
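
// Illustrative only: a worked example of calculate_memory() above with
// c.reserve_memory and c.total_memory unset and panic_factor = 1, on a machine
// reporting 16 GiB (17179869184 bytes) of available memory:
//
//   default_reserve_memory = max(1536 MiB, 0.07 * 16 GiB) = 1536 MiB
//   available_memory       = 16 GiB - 1536 MiB            ~ 14.5 GiB
//
// and that remainder is returned. If the machine had less than
// reserve + 500'000'000 bytes, the function would fall back to the 500 MB minimum.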

}

}

#ifdef SEASTAR_HAVE_HWLOC

#include <seastar/util/defer.hh>
#include <seastar/core/print.hh>
#include <hwloc.h>
#include <unordered_map>
#include <boost/range/irange.hpp>

namespace seastar {

cpu_set_t cpuid_to_cpuset(unsigned cpuid) {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(cpuid, &cs);
    return cs;
}

namespace resource {

size_t div_roundup(size_t num, size_t denom) {
    return (num + denom - 1) / denom;
}

static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map<hwloc_obj_t, size_t>& used_mem, size_t alloc) {
#if HWLOC_API_VERSION >= 0x00020000
    // FIXME: support nodes with multiple NUMA nodes, whatever that means
    auto local_memory = node->total_memory;
#else
    auto local_memory = node->memory.local_memory;
#endif
    auto taken = std::min(local_memory - used_mem[node], alloc);
    if (taken) {
        used_mem[node] += taken;
        auto node_id = hwloc_bitmap_first(node->nodeset);
        assert(node_id != -1);
        this_cpu.mem.push_back({taken, unsigned(node_id)});
    }
    return taken;
}

// Find the numa node that contains a specific PU.
static hwloc_obj_t get_numa_node_for_pu(hwloc_topology_t& topology, hwloc_obj_t pu) {
    // Can't use ancestry because hwloc 2.0 NUMA nodes are not ancestors of PUs
    hwloc_obj_t tmp = NULL;
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    while ((tmp = hwloc_get_next_obj_by_depth(topology, depth, tmp)) != NULL) {
        if (hwloc_bitmap_intersects(tmp->cpuset, pu->cpuset)) {
            return tmp;
        }
    }
    assert(false && "PU not inside any NUMA node");
    abort();
}

struct distribute_objects {
    std::vector<hwloc_cpuset_t> cpu_sets;
    hwloc_obj_t root;

    distribute_objects(hwloc_topology_t& topology, size_t nobjs) : cpu_sets(nobjs), root(hwloc_get_root_obj(topology)) {
#if HWLOC_API_VERSION >= 0x00010900
        hwloc_distrib(topology, &root, 1, cpu_sets.data(), cpu_sets.size(), INT_MAX, 0);
#else
        hwloc_distribute(topology, root, cpu_sets.data(), cpu_sets.size(), INT_MAX);
#endif
    }

    ~distribute_objects() {
        for (auto&& cs : cpu_sets) {
            hwloc_bitmap_free(cs);
        }
    }
    std::vector<hwloc_cpuset_t>& operator()() {
        return cpu_sets;
    }
};
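
// Illustrative only: distribute_objects(topology, N) asks hwloc to spread N cpusets
// as evenly as possible over the (possibly restricted) topology. For example, on a
// hypothetical machine with two packages of two cores each, distribute_objects(topology, 4)
// would be expected to yield one cpuset per core rather than four cpusets packed into
// a single package.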

static io_queue_topology
allocate_io_queues(hwloc_topology_t& topology, std::vector<cpu> cpus, unsigned num_io_queues, unsigned& last_node_idx) {
    auto node_of_shard = [&topology, &cpus] (unsigned shard) {
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpus[shard].cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        return hwloc_bitmap_first(node->nodeset);
    };

    // There are two things we are trying to achieve by populating a numa_nodes map.
    //
    // The first is to find out how many nodes we have in the system. We can't use
    // hwloc for that, because at this point we are no longer talking about the physical system,
    // but the actual booted seastar server instead. So if we have restricted the run to a subset
    // of the available processors, counting topology nodes won't yield the same result.
    //
    // Secondly, we need to find out which processors live in each node. For a reason similar to the
    // above, hwloc won't do us any good here. Later on, we will use this information to assign
    // shards to coordinators that are node-local to themselves.
    std::unordered_map<unsigned, std::set<unsigned>> numa_nodes;
    for (auto shard: boost::irange(0, int(cpus.size()))) {
        auto node_id = node_of_shard(shard);

        if (numa_nodes.count(node_id) == 0) {
            numa_nodes.emplace(node_id, std::set<unsigned>());
        }
        numa_nodes.at(node_id).insert(shard);
    }
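
    // Illustrative only: if the run is restricted to PUs 0-3 on a two-node machine
    // where PUs 0-1 sit on NUMA node 0 and PUs 2-3 on NUMA node 1, this map ends up
    // as numa_nodes = { 0 -> {shards 0, 1}, 1 -> {shards 2, 3} } (shard indexes, not
    // OS cpu ids, are the stored values).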

    io_queue_topology ret;
    ret.shard_to_coordinator.resize(cpus.size());
    ret.coordinator_to_idx.resize(cpus.size());
    ret.coordinator_to_idx_valid.resize(cpus.size());

    // The user may be playing with the --smp option, but num_io_queues was independently
    // determined by iotune, so adjust for any conflicts.
    if (num_io_queues > cpus.size()) {
        fmt::print("Warning: number of IO queues ({:d}) greater than logical cores ({:d}). Adjusting downwards.\n", num_io_queues, cpus.size());
        num_io_queues = cpus.size();
    }

    auto find_shard = [&cpus] (unsigned cpu_id) {
        auto idx = 0u;
        for (auto& c: cpus) {
            if (c.cpu_id == cpu_id) {
                return idx;
            }
            idx++;
        }
        assert(0);
    };

    auto cpu_sets = distribute_objects(topology, num_io_queues);
    ret.coordinators.reserve(cpu_sets().size());

    // First step: distribute the IO queues given the information returned in cpu_sets.
    // If there is one IO queue per processor, only this loop will be executed.
    std::unordered_map<unsigned, std::vector<unsigned>> node_coordinators;
    for (auto&& cs : cpu_sets()) {
        auto io_coordinator = find_shard(hwloc_bitmap_first(cs));

        ret.coordinator_to_idx[io_coordinator] = ret.coordinators.size();
        assert(!ret.coordinator_to_idx_valid[io_coordinator]);
        ret.coordinator_to_idx_valid[io_coordinator] = true;
        ret.coordinators.emplace_back(io_coordinator);
        // If a processor is a coordinator, it is obviously also its own coordinator
        ret.shard_to_coordinator[io_coordinator] = io_coordinator;

        auto node_id = node_of_shard(io_coordinator);
        if (node_coordinators.count(node_id) == 0) {
            node_coordinators.emplace(node_id, std::vector<unsigned>());
        }
        node_coordinators.at(node_id).push_back(io_coordinator);
        numa_nodes[node_id].erase(io_coordinator);
    }

    auto available_nodes = boost::copy_range<std::vector<unsigned>>(node_coordinators | boost::adaptors::map_keys);

    // If there are more processors than coordinators, we will have to assign them to existing
    // coordinators. We prefer to do that within the same NUMA node, but if that is not possible
    // we assign the shard to a random node.
    for (auto& node: numa_nodes) {
        auto cid_idx = 0;
        for (auto& remaining_shard: node.second) {
            auto my_node = node.first;
            // No I/O queue in this node, round-robin shards from this node into existing ones.
            if (!node_coordinators.count(node.first)) {
                my_node = available_nodes[last_node_idx++ % available_nodes.size()];
            }
            auto idx = cid_idx++ % node_coordinators.at(my_node).size();
            auto io_coordinator = node_coordinators.at(my_node)[idx];
            ret.shard_to_coordinator[remaining_shard] = io_coordinator;
        }
    }
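
    // Illustrative only: continuing the two-node example above with num_io_queues = 2
    // and both coordinators landing on node 0, node 1 has shards but no coordinator of
    // its own, so its shards are round-robined (via last_node_idx) onto coordinators
    // picked from available_nodes, i.e. onto node 0's coordinators in this case.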

    return ret;
}

resources allocate(configuration c) {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    if (c.cpu_set) {
        auto bm = hwloc_bitmap_alloc();
        auto free_bm = defer([&] { hwloc_bitmap_free(bm); });
        for (auto idx : *c.cpu_set) {
            hwloc_bitmap_set(bm, idx);
        }
        auto r = hwloc_topology_restrict(topology, bm,
#if HWLOC_API_VERSION >= 0x00020000
                0
#else
                HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES
#endif
                | HWLOC_RESTRICT_FLAG_ADAPT_MISC
                | HWLOC_RESTRICT_FLAG_ADAPT_IO);
        if (r == -1) {
            if (errno == ENOMEM) {
                throw std::bad_alloc();
            }
            if (errno == EINVAL) {
                throw std::runtime_error("bad cpuset");
            }
            abort();
        }
    }
    auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE);
    assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1);
    auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0);
#if HWLOC_API_VERSION >= 0x00020000
    auto available_memory = machine->total_memory;
#else
    auto available_memory = machine->memory.total_memory;
#endif
    size_t mem = calculate_memory(c, std::min(available_memory,
                                              cgroup::memory_limit()));
    unsigned available_procs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
    unsigned procs = c.cpus.value_or(available_procs);
    if (procs > available_procs) {
        throw std::runtime_error("insufficient processing units");
    }
    auto mem_per_proc = align_down<size_t>(mem / procs, 2 << 20);

    resources ret;
    std::unordered_map<hwloc_obj_t, size_t> topo_used_mem;
    std::vector<std::pair<cpu, size_t>> remains;
    size_t remain;

    auto cpu_sets = distribute_objects(topology, procs);

    // Divide local memory among cpus
    for (auto&& cs : cpu_sets()) {
        auto cpu_id = hwloc_bitmap_first(cs);
        assert(cpu_id != -1);
        auto pu = hwloc_get_pu_obj_by_os_index(topology, cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        cpu this_cpu;
        this_cpu.cpu_id = cpu_id;
        remain = mem_per_proc - alloc_from_node(this_cpu, node, topo_used_mem, mem_per_proc);

        remains.emplace_back(std::move(this_cpu), remain);
    }

    // Divide the rest of the memory
    auto depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NUMANODE);
    for (auto&& r : remains) {
        cpu this_cpu;
        size_t remain;
        std::tie(this_cpu, remain) = r;
        auto pu = hwloc_get_pu_obj_by_os_index(topology, this_cpu.cpu_id);
        auto node = get_numa_node_for_pu(topology, pu);
        auto obj = node;

        while (remain) {
            remain -= alloc_from_node(this_cpu, obj, topo_used_mem, remain);
            do {
                obj = hwloc_get_next_obj_by_depth(topology, depth, obj);
            } while (!obj);
            if (obj == node)
                break;
        }
        assert(!remain);
        ret.cpus.push_back(std::move(this_cpu));
    }
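
    // Illustrative only: suppose two NUMA nodes with 8 GiB of local memory each and
    // all shards pinned to node 0 (a made-up layout). The first loop above hands out
    // node-0 memory until it is exhausted; any per-shard remainder is then satisfied
    // by the second loop, which walks the NUMA-node depth (wrapping around) and takes
    // the rest from node 1, giving up only when it arrives back at the home node.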

    unsigned last_node_idx = 0;
    for (auto d : c.num_io_queues) {
        auto devid = d.first;
        auto num_io_queues = d.second;
        ret.ioq_topology.emplace(devid, allocate_io_queues(topology, ret.cpus, num_io_queues, last_node_idx));
    }
    return ret;
}

unsigned nr_processing_units() {
    hwloc_topology_t topology;
    hwloc_topology_init(&topology);
    auto free_hwloc = defer([&] { hwloc_topology_destroy(topology); });
    hwloc_topology_load(topology);
    return hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
}

}

}

#else

#include <seastar/core/resource.hh>
#include <unistd.h>

namespace seastar {

namespace resource {

// Without hwloc, we don't support tuning the number of IO queues. So each CPU gets its own.
static io_queue_topology
allocate_io_queues(configuration c, std::vector<cpu> cpus) {
    io_queue_topology ret;

    unsigned nr_cpus = unsigned(cpus.size());
    ret.shard_to_coordinator.resize(nr_cpus);
    ret.coordinators.resize(nr_cpus);
    ret.coordinator_to_idx.resize(nr_cpus);
    ret.coordinator_to_idx_valid.resize(nr_cpus);

    for (unsigned shard = 0; shard < nr_cpus; ++shard) {
        ret.shard_to_coordinator[shard] = shard;
        ret.coordinators[shard] = shard;
        ret.coordinator_to_idx[shard] = shard;
        ret.coordinator_to_idx_valid[shard] = true;
    }
    return ret;
}

resources allocate(configuration c) {
    resources ret;

    auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES));
    auto mem = calculate_memory(c, available_memory);
    auto cpuset_procs = c.cpu_set ? c.cpu_set->size() : nr_processing_units();
    auto procs = c.cpus.value_or(cpuset_procs);
    ret.cpus.reserve(procs);
    if (c.cpu_set) {
        for (auto cpuid : *c.cpu_set) {
            ret.cpus.push_back(cpu{cpuid, {{mem / procs, 0}}});
        }
    } else {
        for (unsigned i = 0; i < procs; ++i) {
            ret.cpus.push_back(cpu{i, {{mem / procs, 0}}});
        }
    }

    ret.ioq_topology.emplace(0, allocate_io_queues(c, ret.cpus));
    return ret;
}

unsigned nr_processing_units() {
    return ::sysconf(_SC_NPROCESSORS_ONLN);
}

}

}

#endif