// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "AvlAllocator.h"

#include <bit>
#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "AvlAllocator "

MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);

namespace {
// a light-weight "range_seg_t", which is only used as the key when searching
// in range_tree and range_size_tree
struct range_t {
  uint64_t start;
  uint64_t end;
};
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
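// The search starts at *cursor (first-fit), wraps around to the beginning of
// the tree at most once, and gives up early when the walk exceeds
// max_search_count entries or spans more than max_search_bytes, returning
// -1ULL if no suitable block was found.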
uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
                                         uint64_t size,
                                         uint64_t align)
{
  const auto compare = range_tree.key_comp();
  uint32_t search_count = 0;
  uint64_t search_bytes = 0;
  auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
  for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
    uint64_t offset = rs->start;
    *cursor = offset + size;
    if (offset + size <= rs->end) {
      return offset;
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return -1ULL;
    }
    if (search_bytes = rs->start - rs_start->start;
        max_search_bytes > 0 && search_bytes > max_search_bytes) {
      return -1ULL;
    }
  }

  if (*cursor == 0) {
    // If we already started from the beginning, don't bother searching
    // from the beginning again.
    return -1ULL;
  }
  // If we reached the end, search from the beginning up to the cursor.
  for (auto rs = range_tree.begin(); rs != rs_start; ++rs) {
    uint64_t offset = rs->start;
    *cursor = offset + size;
    if (offset + size <= rs->end) {
      return offset;
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return -1ULL;
    }
    if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) {
      return -1ULL;
    }
  }
  return -1ULL;
}

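/*
 * Best-fit fallback: rather than scanning from a cursor, pick the smallest
 * free range (via the size-sorted tree) that can hold the request. Returns
 * the start offset on success, or -1ULL if nothing fits.
 */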
uint64_t AvlAllocator::_pick_block_fits(uint64_t size,
                                        uint64_t align)
{
  // instead of searching from cursor, just pick the smallest range which fits
  // the needs
  const auto compare = range_size_tree.key_comp();
  auto rs_start = range_size_tree.lower_bound(range_t{0, size}, compare);
  for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) {
    uint64_t offset = rs->start;
    if (offset + size <= rs->end) {
      return offset;
    }
  }
  return -1ULL;
}

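/*
 * Add the free range [start, start + size) to the offset-sorted tree,
 * coalescing it with an adjacent range on either side whenever their
 * boundaries touch, and keep the size-sorted tree in sync.
 */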
void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(range_t{start, end},
                                         range_tree.key_comp());

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);

  if (merge_before && merge_after) {
    _range_size_tree_rm(*rs_before);
    _range_size_tree_rm(*rs_after);
    rs_after->start = rs_before->start;
    range_tree.erase_and_dispose(rs_before, dispose_rs{});
    _range_size_tree_try_insert(*rs_after);
  } else if (merge_before) {
    _range_size_tree_rm(*rs_before);
    rs_before->end = end;
    _range_size_tree_try_insert(*rs_before);
  } else if (merge_after) {
    _range_size_tree_rm(*rs_after);
    rs_after->start = start;
    _range_size_tree_try_insert(*rs_after);
  } else {
    _try_insert_range(start, end, &rs_after);
  }
}

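/*
 * Carve [start, end) out of the free range referenced by rs, which must fully
 * contain it: trim one side, split the range in two, or erase it entirely,
 * keeping the size-sorted tree in sync.
 */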
void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
  AvlAllocator::range_tree_t::iterator& rs)
{
  bool left_over = (rs->start != start);
  bool right_over = (rs->end != end);

  _range_size_tree_rm(*rs);

  if (left_over && right_over) {
    auto old_right_end = rs->end;
    auto insert_pos = rs;
    ceph_assert(insert_pos != range_tree.end());
    ++insert_pos;
    rs->end = start;

    // Insert the tail first to be sure insert_pos hasn't been disposed.
    // This wouldn't dispose rs though, since it's out of range_size_tree.
    // Don't care about the small chance of a 'not-the-best-choice-for-removal'
    // case which might happen if rs has the lowest size.
    _try_insert_range(end, old_right_end, &insert_pos);
    _range_size_tree_try_insert(*rs);

  } else if (left_over) {
    rs->end = start;
    _range_size_tree_try_insert(*rs);
  } else if (right_over) {
    rs->start = end;
    _range_size_tree_try_insert(*rs);
  } else {
    range_tree.erase_and_dispose(rs, dispose_rs{});
  }
}

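/*
 * Remove the allocated range [start, start + size) from the free-space trees;
 * the range must be fully contained in a single existing free segment.
 */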
void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->start <= start);
  ceph_assert(rs->end >= end);

  _process_range_removal(start, end, rs);
}

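/*
 * Like _remove_from_tree(), but the range does not have to be entirely free:
 * the callback is invoked for every sub-range of [start, start + size), with
 * the last argument set to true for pieces that were actually removed from
 * the tree and false for pieces that were not free.
 */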
void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
  std::function<void(uint64_t, uint64_t, bool)> cb)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(range_t{ start, end },
                            range_tree.key_comp());

  if (rs == range_tree.end() || rs->start >= end) {
    cb(start, size, false);
    return;
  }

  do {

    auto next_rs = rs;
    ++next_rs;

    if (start < rs->start) {
      cb(start, rs->start - start, false);
      start = rs->start;
    }
    auto range_end = std::min(rs->end, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->start < end && start < end);
  if (start < end) {
    cb(start, end - start, false);
  }
}

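/*
 * Allocate up to 'want' bytes as a sequence of extents of at most
 * max_alloc_size bytes each, appending them to 'extents'. Returns the number
 * of bytes actually allocated, or -ENOSPC if nothing could be allocated.
 */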
int64_t AvlAllocator::_allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
                      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  return allocated ? allocated : -ENOSPC;
}

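/*
 * Allocate a single extent of at most 'size' bytes, shrinking the request to
 * a unit-aligned fraction of the largest free range if needed. A cursor-based
 * first-fit search is used unless free space is scarce or fragmented, in
 * which case the allocator falls back to best-fit over the size-sorted tree.
 */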
int AvlAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->end - p->start;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }

  const int free_pct = num_free * 100 / device_size;
  uint64_t start = 0;
  // If we're running low on space, find a range by size by looking it up in
  // the size-sorted tree (best-fit), instead of searching the area pointed to
  // by the cursor
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    start = -1ULL;
  } else {
    /*
     * Find the largest power of 2 block size that evenly divides the
     * requested size. This is used to try to allocate blocks with similar
     * alignment from the same area (i.e. the same cursor bucket), but it
     * does not guarantee that other allocation sizes exist in the same
     * region.
     */
    uint64_t align = size & -size;
    ceph_assert(align != 0);
    uint64_t* cursor = &lbas[cbits(align) - 1];
    start = _pick_block_after(cursor, size, unit);
    dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
  }
  if (start == -1ULL) {
    do {
      start = _pick_block_fits(size, unit);
      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}

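/*
 * Return previously allocated extents to the free-space trees.
 */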
void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
{
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ceph_assert(offset + length <= uint64_t(device_size));
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

void AvlAllocator::_release(const PExtentVector& release_set) {
  for (auto& e : release_set) {
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << e.offset
                   << " length 0x" << e.length
                   << std::dec << dendl;
    _add_to_tree(e.offset, e.length);
  }
}

void AvlAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear_and_dispose(dispose_rs{});
}

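/*
 * Tunables read from the configuration:
 *  - bluestore_avl_alloc_bf_threshold and bluestore_avl_alloc_bf_free_pct
 *    control when _allocate() switches from first-fit to best-fit,
 *  - bluestore_avl_alloc_ff_max_search_count and
 *    bluestore_avl_alloc_ff_max_search_bytes bound how far the first-fit
 *    search walks before giving up,
 *  - max_mem caps the number of range_seg_t entries kept in memory
 *    (range_count_cap).
 */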
AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           uint64_t max_mem,
                           std::string_view name) :
  Allocator(name, device_size, block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  max_search_count(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
  max_search_bytes(
    cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           std::string_view name) :
  AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
{}

AvlAllocator::~AvlAllocator()
{
  shutdown();
}

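/*
 * Public entry point: validates the request, clamps max_alloc_size to the
 * largest extent length representable by bluestore_pextent_t::length, and
 * performs the allocation under the allocator lock.
 */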
int64_t AvlAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  ceph_assert(std::has_single_bit(unit));
  ceph_assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
  }
  std::lock_guard l(lock);
  return _allocate(want, unit, max_alloc_size, hint, extents);
}

void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
  std::lock_guard l(lock);
  _release(release_set);
}

uint64_t AvlAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double AvlAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  return _get_fragmentation();
}

void AvlAllocator::dump()
{
  std::lock_guard l(lock);
  _dump();
}

void AvlAllocator::_dump() const
{
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }
  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }
}

void AvlAllocator::foreach(
  std::function<void(uint64_t offset, uint64_t length)> notify)
{
  std::lock_guard l(lock);
  _foreach(notify);
}

void AvlAllocator::_foreach(
  std::function<void(uint64_t offset, uint64_t length)> notify) const
{
  for (auto& rs : range_tree) {
    notify(rs.start, rs.end - rs.start);
  }
}

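/*
 * init_add_free() / init_rm_free() seed the allocator's view of the device:
 * both check that the extent lies within the device and then add it to, or
 * remove it from, the trees under the allocator lock.
 */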
void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  if (!length)
    return;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  _add_to_tree(offset, length);
}

void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  if (!length)
    return;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  _remove_from_tree(offset, length);
}

void AvlAllocator::shutdown()
{
  std::lock_guard l(lock);
  _shutdown();
}