// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
// vim: ts=8 sw=2 smarttab

#include "BtreeAllocator.h"

#include <bit>
#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "BtreeAllocator "
/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified B-tree
 * looking for a block that matches the specified criteria.
 */
uint64_t BtreeAllocator::_pick_block_after(uint64_t *cursor,
                                           uint64_t size,
                                           uint64_t align)
{
  auto rs_start = range_tree.lower_bound(*cursor);
  for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
    uint64_t offset = rs->first;
    if (offset + size <= rs->second) {
      *cursor = offset + size;
      return offset;
    }
  }
  if (*cursor == 0) {
    // we already searched from the beginning; no point doing it again
    return -1ULL;
  }
  // we reached the end; wrap around and search from the beginning up
  // to the cursor
  for (auto rs = range_tree.begin(); rs != rs_start; ++rs) {
    uint64_t offset = rs->first;
    if (offset + size <= rs->second) {
      *cursor = offset + size;
      return offset;
    }
  }
  return -1ULL;
}

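/*
 * Best-fit variant: range_size_tree is ordered by segment size, so a
 * lower_bound lookup with a key of the requested size lands on the
 * smallest free segment that is still big enough (range_value_t{0, size}
 * builds such a key, assuming the (start, end) constructor implied by
 * the emplace(start, end) call sites elsewhere in this file).
 */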
uint64_t BtreeAllocator::_pick_block_fits(uint64_t size,
                                          uint64_t align)
{
  // instead of searching from cursor, just pick the smallest range which fits
  // the needs
  auto rs_start = range_size_tree.lower_bound(range_value_t{0,size});
  for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) {
    uint64_t offset = rs->start;
    if (offset + size <= rs->start + rs->size) {
      return offset;
    }
  }
  return -1ULL;
}

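/*
 * Insert the free range [start, start + size) into both trees, merging
 * with adjacent free segments where possible.  E.g. with [0x0, 0x1000)
 * and [0x2000, 0x3000) free, adding 0x1000~0x1000 coalesces everything
 * into the single segment [0x0, 0x3000).
 */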
void BtreeAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(start);

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  bool merge_before = (rs_before != range_tree.end() && rs_before->second == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->first == end);

  if (merge_before && merge_after) {
    // | before |//////| after |
    // | before >>>>>>>>>>>>>>> |
    range_seg_t seg_before{rs_before->first, rs_before->second};
    range_seg_t seg_after{rs_after->first, rs_after->second};
    // expand the head seg before rs_{before,after} are invalidated
    rs_before->second = seg_after.end;
    // remove the tail seg from offset tree
    range_tree.erase(rs_after);
    // remove the head and tail seg from size tree
    range_size_tree.erase(seg_before);
    range_size_tree.erase(seg_after);
    // insert the merged seg into size tree
    range_size_tree.emplace(seg_before.start, seg_after.end);
  } else if (merge_before) {
    // | before |//////|
    // | before >>>>>>>> |
    // remove the head seg from the size tree
    range_seg_t seg_before{rs_before->first, rs_before->second};
    range_size_tree.erase(seg_before);
    // expand the head seg in the offset tree
    rs_before->second = end;
    // insert the merged seg into size tree
    range_size_tree.emplace(seg_before.start, end);
  } else if (merge_after) {
    // |//////| after |
    // | merge after |
    // remove the tail seg from size tree
    range_seg_t seg_after{rs_after->first, rs_after->second};
    range_size_tree.erase(seg_after);
    // remove the tail seg from offset tree
    range_tree.erase(rs_after);
    // insert the merged seg
    range_tree.emplace(start, seg_after.end);
    range_size_tree.emplace(start, seg_after.end);
  } else {
    // no neighbours
    range_tree.emplace_hint(rs_after, start, end);
    range_size_tree.emplace(start, end);
  }
  num_free += size;
}

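/*
 * Carve [start, end) out of the free segment *rs, which must fully
 * contain it.  Depending on whether free space is left over on the
 * left and/or on the right, the segment is shrunk, split in two, or
 * erased outright.
 */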
void BtreeAllocator::_process_range_removal(uint64_t start, uint64_t end,
                                            BtreeAllocator::range_tree_t::iterator& rs)
{
  bool left_over = (rs->first != start);
  bool right_over = (rs->second != end);

  range_seg_t seg_whole{rs->first, rs->second};
  range_size_tree.erase(seg_whole);

  // | left <|////| right |
  if (left_over && right_over) {
    // add the spin-off right seg
    range_seg_t seg_after{end, seg_whole.end};
    range_tree.emplace_hint(rs, seg_after.start, seg_after.end);
    range_size_tree.emplace(seg_after);
    // shrink the left seg in the offset tree
    rs->second = start;
    // insert the shrunken left seg back into the size tree
    range_size_tree.emplace(seg_whole.start, start);
  } else if (left_over) {
    // | left <|///////////|
    // shrink the left seg in the offset tree
    rs->second = start;
    // insert the shrunken left seg back into the size tree
    range_size_tree.emplace(seg_whole.start, start);
  } else if (right_over) {
    // |//////////| right |
    // remove the whole seg from the offset tree
    range_tree.erase(rs);
    // add the spin-off right seg
    range_seg_t seg_after{end, seg_whole.end};
    range_tree.emplace(seg_after.start, seg_after.end);
    range_size_tree.emplace(seg_after);
  } else {
    range_tree.erase(rs);
  }
  num_free -= (end - start);
}

void BtreeAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(start);
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->first <= start);
  ceph_assert(rs->second >= end);

  _process_range_removal(start, end, rs);
}

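/*
 * Best-effort removal of [start, start + size): only the parts that
 * are actually free get removed.  Every sub-range is reported through
 * cb as (offset, length, removed), with removed == true for ranges
 * taken out of the trees and false for holes that were not free in
 * the first place.
 *
 * A hypothetical caller, just to illustrate the callback contract:
 *
 *   uint64_t claimed = 0;
 *   _try_remove_from_tree(off, len,
 *     [&](uint64_t o, uint64_t l, bool removed) {
 *       if (removed) claimed += l;
 *     });
 */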
void BtreeAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
                                           std::function<void(uint64_t, uint64_t, bool)> cb)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(start);

  if (rs == range_tree.end() || rs->first >= end) {
    cb(start, size, false);
    return;
  }

  do {
    auto next_rs = rs;
    ++next_rs;

    if (start < rs->first) {
      cb(start, rs->first - start, false);
      start = rs->first;
    }
    auto range_end = std::min(rs->second, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->first < end && start < end);
  if (start < end) {
    cb(start, end - start, false);
  }
}

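/*
 * Satisfy `want` bytes as a series of extents, each at most
 * max_alloc_size long.  Partial success still counts: the loop stops
 * on the first failure and whatever was gathered so far is returned;
 * -ENOSPC comes back only if nothing at all could be allocated.
 */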
int64_t BtreeAllocator::_allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
                      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  assert(range_size_tree.size() == range_tree.size());
  return allocated ? allocated : -ENOSPC;
}

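/*
 * Allocate one contiguous extent.  Normally this is a first-fit search
 * through the offset-sorted tree, resuming from a per-size-class cursor
 * (lbas[cbits(size) - 1], i.e. one cursor per power-of-two bucket) so
 * that similarly sized allocations cluster together.  When the device
 * is nearly full, or the largest free segment gets small, it switches
 * to a best-fit search through the size-sorted tree instead.
 */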
int BtreeAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->size;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }

  const int free_pct = num_free * 100 / device_size;
  uint64_t start = 0;
  /*
   * If we're running low on space, switch to using the size
   * sorted B-tree (best-fit).
   */
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    do {
      start = _pick_block_fits(size, unit);
      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  } else {
    do {
      /*
       * Find the largest power of 2 block size that evenly divides the
       * requested size. This is used to try to allocate blocks with similar
       * alignment from the same area (i.e. same cursor bucket), but it does
       * not guarantee that allocations of other sizes will not end up in
       * the same region.
       */
      uint64_t* cursor = &lbas[cbits(size) - 1];
      start = _pick_block_after(cursor, size, unit);
      dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}

void BtreeAllocator::_release(const interval_set<uint64_t>& release_set)
{
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ceph_assert(offset + length <= uint64_t(device_size));
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

void BtreeAllocator::_release(const PExtentVector& release_set) {
  for (auto& e : release_set) {
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << e.offset
                   << " length 0x" << e.length
                   << std::dec << dendl;
    _add_to_tree(e.offset, e.length);
  }
}

void BtreeAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear();
}

BtreeAllocator::BtreeAllocator(CephContext* cct,
                               int64_t device_size,
                               int64_t block_size,
                               uint64_t max_mem,
                               std::string_view name) :
  Allocator(name, device_size, block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}

BtreeAllocator::BtreeAllocator(CephContext* cct,
                               int64_t device_size,
                               int64_t block_size,
                               std::string_view name) :
  BtreeAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
{}

BtreeAllocator::~BtreeAllocator()
{
  shutdown();
}

int64_t BtreeAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  ceph_assert(std::has_single_bit(unit));
  ceph_assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
  }
  std::lock_guard l(lock);
  return _allocate(want, unit, max_alloc_size, hint, extents);
}

void BtreeAllocator::release(const interval_set<uint64_t>& release_set) {
  std::lock_guard l(lock);
  _release(release_set);
}

uint64_t BtreeAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double BtreeAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  return _get_fragmentation();
}

void BtreeAllocator::dump()
{
  std::lock_guard l(lock);
  _dump();
}

void BtreeAllocator::_dump() const
{
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.first << "~" << rs.second
                  << std::dec
                  << dendl;
  }

  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.size << "@" << rs.start
                  << std::dec
                  << dendl;
  }
}

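/*
 * Walk all free segments under the allocator lock, handing each one to
 * `notify` as (offset, length); the offset tree stores [start, end)
 * pairs, hence the subtraction.
 */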
void BtreeAllocator::foreach(std::function<void(uint64_t offset, uint64_t length)> notify)
{
  std::lock_guard l(lock);
  for (auto& rs : range_tree) {
    notify(rs.first, rs.second - rs.first);
  }
}

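/*
 * init_add_free()/init_rm_free() seed the free map while the allocator
 * is being primed (e.g. at startup), marking ranges as free or as
 * already in use respectively.
 */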
void BtreeAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  if (!length)
    return;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _add_to_tree(offset, length);
}

void BtreeAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  if (!length)
    return;
  std::lock_guard l(lock);
  ceph_assert(offset + length <= uint64_t(device_size));
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _remove_from_tree(offset, length);
}

void BtreeAllocator::shutdown()
{
  std::lock_guard l(lock);
  _shutdown();
}