// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "AvlAllocator.h"

#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "AvlAllocator "

MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);

namespace {
// a light-weight "range_seg_t", which is only used as the key when
// searching in range_tree and range_size_tree
struct range_t {
  uint64_t start;
  uint64_t end;
};
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
template<class Tree>
uint64_t AvlAllocator::_block_picker(const Tree& t,
				     uint64_t *cursor,
				     uint64_t size,
				     uint64_t align)
{
  const auto compare = t.key_comp();
  for (auto rs = t.lower_bound(range_t{*cursor, size}, compare);
       rs != t.end(); ++rs) {
    uint64_t offset = p2roundup(rs->start, align);
    if (offset + size <= rs->end) {
      *cursor = offset + size;
      return offset;
    }
  }
  /*
   * If we know we've searched the whole tree (*cursor == 0), give up.
   * Otherwise, reset the cursor to the beginning and try again.
   */
  if (*cursor == 0) {
    return -1ULL;
  }
  *cursor = 0;
  return _block_picker(t, cursor, size, align);
}

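/*
 * Add the free range [start, start + size) back to the trees, merging it
 * with the adjacent segment on either side when they touch, so the trees
 * always hold maximal, non-overlapping free extents.
 */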
void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(range_t{start, end},
					 range_tree.key_comp());

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);

  if (merge_before && merge_after) {
    _range_size_tree_rm(*rs_before);
    _range_size_tree_rm(*rs_after);
    rs_after->start = rs_before->start;
    range_tree.erase_and_dispose(rs_before, dispose_rs{});
    _range_size_tree_try_insert(*rs_after);
  } else if (merge_before) {
    _range_size_tree_rm(*rs_before);
    rs_before->end = end;
    _range_size_tree_try_insert(*rs_before);
  } else if (merge_after) {
    _range_size_tree_rm(*rs_after);
    rs_after->start = start;
    _range_size_tree_try_insert(*rs_after);
  } else {
    _try_insert_range(start, end, &rs_after);
  }
}

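/*
 * Carve [start, end) out of the segment pointed to by rs: trim it on the
 * left, trim it on the right, split it in two, or erase it entirely,
 * depending on which edges of the removed range coincide with the
 * segment's edges.
 */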
void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
  AvlAllocator::range_tree_t::iterator& rs)
{
  bool left_over = (rs->start != start);
  bool right_over = (rs->end != end);

  _range_size_tree_rm(*rs);

  if (left_over && right_over) {
    auto old_right_end = rs->end;
    auto insert_pos = rs;
    ceph_assert(insert_pos != range_tree.end());
    ++insert_pos;
    rs->end = start;

    // Insert the tail first to be sure insert_pos hasn't been disposed.
    // This wouldn't dispose rs though, since it's out of range_size_tree.
    // Don't care about the small chance of a 'not-the-best-choice-for-removal'
    // case, which might happen if rs has the lowest size.
    _try_insert_range(end, old_right_end, &insert_pos);
    _range_size_tree_try_insert(*rs);

  } else if (left_over) {
    rs->end = start;
    _range_size_tree_try_insert(*rs);
  } else if (right_over) {
    rs->start = end;
    _range_size_tree_try_insert(*rs);
  } else {
    range_tree.erase_and_dispose(rs, dispose_rs{});
  }
}

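/*
 * Remove [start, start + size) from the trees. The range must be fully
 * contained within a single existing free segment.
 */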
void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->start <= start);
  ceph_assert(rs->end >= end);

  _process_range_removal(start, end, rs);
}

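/*
 * Best-effort variant of _remove_from_tree(): walk the free segments that
 * overlap [start, start + size), remove the overlapping pieces, and report
 * every sub-range to cb, with the bool argument indicating whether that
 * sub-range was actually present in the tree (and hence removed).
 */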
void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
  std::function<void(uint64_t, uint64_t, bool)> cb)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(range_t{start, end},
			    range_tree.key_comp());

  if (rs == range_tree.end() || rs->start >= end) {
    cb(start, size, false);
    return;
  }

  do {
    auto next_rs = rs;
    ++next_rs;

    if (start < rs->start) {
      cb(start, rs->start - start, false);
      start = rs->start;
    }
    auto range_end = std::min(rs->end, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->start < end && start < end);
  if (start < end) {
    cb(start, end - start, false);
  }
}

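/*
 * Satisfy a request for `want` bytes by repeatedly allocating extents of
 * at most max_alloc_size bytes until the request is met. Returns the number
 * of bytes actually allocated, or -ENOSPC if nothing could be allocated at
 * all; a short (partial) allocation is not an error.
 */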
int64_t AvlAllocator::_allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
		      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  return allocated ? allocated : -ENOSPC;
}

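/*
 * Allocate a single extent of `size` bytes (shrunk to a multiple of `unit`
 * when the tree cannot satisfy the full request). Uses a first-fit cursor
 * scan of the offset-sorted tree while space is plentiful, and falls back
 * to a best-fit scan of the size-sorted tree when space runs low.
 */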
int AvlAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->end - p->start;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }

  const int free_pct = num_free * 100 / num_total;
  uint64_t start = 0;
  /*
   * If we're running low on space, switch to using the size-sorted
   * AVL tree (best-fit).
   */
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    uint64_t fake_cursor = 0;
    do {
      start = _block_picker(range_size_tree, &fake_cursor, size, unit);
      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents, as we could fail to retrieve
      // that large a block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  } else {
    do {
      /*
       * Find the largest power of 2 block size that evenly divides the
       * requested size. This is used to try to allocate blocks with similar
       * alignment from the same area (i.e. same cursor bucket), but it does
       * not guarantee that allocations of other sizes may not occur in the
       * same region.
       */
      uint64_t align = size & -size;
      ceph_assert(align != 0);
      uint64_t* cursor = &lbas[cbits(align) - 1];

      start = _block_picker(range_tree, cursor, size, unit);
      dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents, as we could fail to retrieve
      // that large a block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}

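/*
 * Return every extent in release_set to the free trees; both overloads
 * just log each extent and hand it to _add_to_tree().
 */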
void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
{
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

void AvlAllocator::_release(const PExtentVector& release_set) {
  for (auto& e : release_set) {
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << e.offset
                   << " length 0x" << e.length
                   << std::dec << dendl;
    _add_to_tree(e.offset, e.length);
  }
}

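// Drop all segments. Both trees link the same nodes intrusively, so the
// size tree is merely cleared while the offset tree disposes the nodes.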
void AvlAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear_and_dispose(dispose_rs{});
}

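/*
 * The max_mem-aware constructor caps the number of range_seg_t entries the
 * allocator may keep (range_count_cap = max_mem / sizeof(range_seg_t));
 * the other constructor sets no cap.
 */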
AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           uint64_t max_mem,
                           const std::string& name) :
  Allocator(name),
  num_total(device_size),
  block_size(block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           const std::string& name) :
  Allocator(name),
  num_total(device_size),
  block_size(block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  cct(cct)
{}

AvlAllocator::~AvlAllocator()
{
  shutdown();
}

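/*
 * Public entry point: validates the request, clamps max_alloc_size to the
 * largest length a bluestore_pextent_t can represent (aligned down to
 * block_size), then takes the lock and delegates to _allocate().
 */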
int64_t AvlAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  ceph_assert(isp2(unit));
  ceph_assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
  }
  std::lock_guard l(lock);
  return _allocate(want, unit, max_alloc_size, hint, extents);
}

void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
  std::lock_guard l(lock);
  _release(release_set);
}

uint64_t AvlAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double AvlAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  return _get_fragmentation();
}

void AvlAllocator::dump()
{
  std::lock_guard l(lock);
  _dump();
}

void AvlAllocator::_dump() const
{
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }

  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }
}

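// Walk all free extents, reporting each as (offset, length) to the callback.
// Note: unlike dump() above, this variant does not take the allocator lock.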
void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
  for (auto& rs : range_tree) {
    notify(rs.start, rs.end - rs.start);
  }
}

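/*
 * init_add_free()/init_rm_free() seed the allocator with the device's
 * initial free map; zero-length updates are ignored.
 */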
void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  if (!length)
    return;
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _add_to_tree(offset, length);
}

void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  if (!length)
    return;
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _remove_from_tree(offset, length);
}

void AvlAllocator::shutdown()
{
  std::lock_guard l(lock);
  _shutdown();
}