// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "AvlAllocator.h"

#include <bit>
#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#define dout_prefix *_dout << "AvlAllocator "

MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);

namespace {
  // a light-weight "range_seg_t", which is only used as the key when
  // searching in range_tree and range_size_tree
  struct range_t {
    uint64_t start;
    uint64_t end;
  };
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
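// Note: the search below is a first-fit scan that resumes at *cursor (one
// cursor per alignment bucket, see _allocate) and is bounded by the
// configured max_search_count / max_search_bytes limits; if it reaches the
// end of the tree it wraps around once and retries from the beginning up to
// the cursor.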
uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
                                         uint64_t size,
                                         uint64_t align)
{
  const auto compare = range_tree.key_comp();
  uint32_t search_count = 0;
  uint64_t search_bytes = 0;
  auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
  for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
    uint64_t offset = rs->start;
    *cursor = offset + size;
    if (offset + size <= rs->end) {
      return offset;
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return -1ULL;
    }
    if (search_bytes = rs->start - rs_start->start;
        max_search_bytes > 0 && search_bytes > max_search_bytes) {
      return -1ULL;
    }
  }

  if (*cursor == 0) {
    // If we already started from the beginning, don't bother searching
    // from the beginning again.
    return -1ULL;
  }
  // If we reached the end, start from the beginning up to the cursor.
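  // The wrap-around pass below keeps charging the same search_count /
  // search_bytes budget, so the combined cost of both passes stays bounded.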
  for (auto rs = range_tree.begin(); rs != rs_start; ++rs) {
    uint64_t offset = rs->start;
    *cursor = offset + size;
    if (offset + size <= rs->end) {
      return offset;
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return -1ULL;
    }
    if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) {
      return -1ULL;
    }
  }
  return -1ULL;
}

uint64_t AvlAllocator::_pick_block_fits(uint64_t size,
                                        uint64_t align)
{
  // instead of searching from the cursor, just pick the smallest range
  // which fits the request
  const auto compare = range_size_tree.key_comp();
  auto rs_start = range_size_tree.lower_bound(range_t{0, size}, compare);
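  // range_size_tree is ordered by segment length, so the lower_bound above
  // already points at the smallest segment large enough for the request and
  // the loop below typically returns on its first iteration.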
  for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) {
    uint64_t offset = rs->start;
    if (offset + size <= rs->end) {
      return offset;
    }
  }
  return -1ULL;
}

void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(range_t{start, end},
                                         range_tree.key_comp());
  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }
  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
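  // Four cases: the new range coalesces with both neighbours (bridging the
  // gap between them), with only the preceding one, with only the following
  // one, or with neither, in which case a fresh range_seg_t is inserted.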
  if (merge_before && merge_after) {
    _range_size_tree_rm(*rs_before);
    _range_size_tree_rm(*rs_after);
    rs_after->start = rs_before->start;
    range_tree.erase_and_dispose(rs_before, dispose_rs{});
    _range_size_tree_try_insert(*rs_after);
  } else if (merge_before) {
    _range_size_tree_rm(*rs_before);
    rs_before->end = end;
    _range_size_tree_try_insert(*rs_before);
  } else if (merge_after) {
    _range_size_tree_rm(*rs_after);
    rs_after->start = start;
    _range_size_tree_try_insert(*rs_after);
  } else {
    _try_insert_range(start, end, &rs_after);
  }
}

void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
  AvlAllocator::range_tree_t::iterator& rs)
{
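  // left_over/right_over flag whether the removed span leaves a remainder at
  // the front and/or the back of the segment it is carved out of.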
  bool left_over = (rs->start != start);
  bool right_over = (rs->end != end);

  _range_size_tree_rm(*rs);
  if (left_over && right_over) {
    auto old_right_end = rs->end;
    auto insert_pos = rs;
    ceph_assert(insert_pos != range_tree.end());
    ++insert_pos;
    rs->end = start;

    // Insert tail first to be sure insert_pos hasn't been disposed.
    // This wouldn't dispose rs though since it's out of range_size_tree.
    // Don't care about a small chance of 'not-the-best-choice-for-removal' case
    // which might happen if rs has the lowest size.
    _try_insert_range(end, old_right_end, &insert_pos);
    _range_size_tree_try_insert(*rs);
  } else if (left_over) {
    rs->end = start;
    _range_size_tree_try_insert(*rs);
  } else if (right_over) {
    rs->start = end;
    _range_size_tree_try_insert(*rs);
  } else {
    range_tree.erase_and_dispose(rs, dispose_rs{});
  }
}

void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->start <= start);
  ceph_assert(rs->end >= end);

  _process_range_removal(start, end, rs);
}

void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
  std::function<void(uint64_t, uint64_t, bool)> cb)
{
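  // cb is invoked once per sub-range of [start, start + size): with 'true'
  // for pieces that were actually free (and have now been removed from the
  // tree) and with 'false' for pieces that were not tracked as free.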
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(range_t{start, end},
                            range_tree.key_comp());
  if (rs == range_tree.end() || rs->start >= end) {
    cb(start, size, false);
    return;
  }
  do {
    auto next_rs = rs;
    ++next_rs;

    if (start < rs->start) {
      cb(start, rs->start - start, false);
      start = rs->start;
    }
    auto range_end = std::min(rs->end, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->start < end && start < end);

  if (start < end) {
    cb(start, end - start, false);
  }
}

int64_t AvlAllocator::_allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t  hint, // unused, for now!
  PExtentVector* extents)
{
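  // Carve the request into chunks of at most max_alloc_size and keep calling
  // the single-extent _allocate() overload until the whole 'want' is
  // satisfied or an attempt fails.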
  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
                      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  return allocated ? allocated : -ENOSPC;
}

int AvlAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->end - p->start;
  }
  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }
  const int free_pct = num_free * 100 / device_size;
  uint64_t start = 0;
  // If we're running low on space, find a range by size by looking up in the
  // size-sorted tree (best-fit), instead of searching in the area pointed to
  // by the cursor
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    start = -1ULL;
  } else {
    /*
     * Find the largest power of 2 block size that evenly divides the
     * requested size. This is used to try to allocate blocks with similar
     * alignment from the same area (i.e. the same cursor bucket), but it
     * does not guarantee that other allocation sizes are absent from the
     * same region.
     */
    uint64_t align = size & -size;
    ceph_assert(align != 0);
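    // e.g. a request of size 0x30000 has lowest set bit 0x10000 (64 KiB), so
    // it lands in the same cursor bucket as other requests whose sizes are
    // odd multiples of 64 KiB (illustrative values only)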
    uint64_t* cursor = &lbas[cbits(align) - 1];
    start = _pick_block_after(cursor, size, unit);
    dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
  }
  if (start == -1ULL) {
    do {
      start = _pick_block_fits(size, unit);
      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}

void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
{
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ceph_assert(offset + length <= uint64_t(device_size));
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

void AvlAllocator::_release(const PExtentVector& release_set) {
  for (auto& e : release_set) {
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << e.offset
                   << " length 0x" << e.length
                   << std::dec << dendl;
    _add_to_tree(e.offset, e.length);
  }
}

void AvlAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear_and_dispose(dispose_rs{});
}

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           uint64_t max_mem,
                           std::string_view name) :
  Allocator(name, device_size, block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  max_search_count(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
  max_search_bytes(
    cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           std::string_view name) :
  AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
{}

AvlAllocator::~AvlAllocator()
{
  shutdown();
}

int64_t AvlAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t  hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  ceph_assert(std::has_single_bit(unit));
  ceph_assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
  }
  std::lock_guard l(lock);
  return _allocate(want, unit, max_alloc_size, hint, extents);
}

void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
  std::lock_guard l(lock);
  _release(release_set);
}

uint64_t AvlAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double AvlAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  return _get_fragmentation();
}

void AvlAllocator::dump()
{
  std::lock_guard l(lock);
  _dump();
}

void AvlAllocator::_dump() const
{
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec << dendl;
  }
  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec << dendl;
  }
}

void AvlAllocator::foreach(
  std::function<void(uint64_t offset, uint64_t length)> notify)
{
  std::lock_guard l(lock);
  _foreach(notify);
}

void AvlAllocator::_foreach(
  std::function<void(uint64_t offset, uint64_t length)> notify) const
{
  for (auto& rs : range_tree) {
    notify(rs.start, rs.end - rs.start);
  }
}

void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  _add_to_tree(offset, length);
}

void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  _remove_from_tree(offset, length);
}

void AvlAllocator::shutdown()
{
  std::lock_guard l(lock);
  _shutdown();
}