// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "AvlAllocator.h"

#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "AvlAllocator "

MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);

namespace {
// a light-weight "range_seg_t", which is only used as the key when searching
// in range_tree and range_size_tree
struct range_t {
  uint64_t start;
  uint64_t end;
};
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
template<class Tree>
uint64_t AvlAllocator::_block_picker(const Tree& t,
                                     uint64_t *cursor,
                                     uint64_t size,
                                     uint64_t align)
{
  const auto compare = t.key_comp();
  auto rs_start = t.lower_bound(range_t{*cursor, size}, compare);
  for (auto rs = rs_start; rs != t.end(); ++rs) {
    uint64_t offset = p2roundup(rs->start, align);
    if (offset + size <= rs->end) {
      *cursor = offset + size;
      return offset;
    }
  }
  /*
   * If we know we've searched the whole tree (*cursor == 0), give up.
   * Otherwise, reset the cursor to the beginning and try again.
   */
  if (*cursor == 0 || rs_start == t.begin()) {
    return -1ULL;
  }
  *cursor = 0;
  return _block_picker(t, cursor, size, align);
}

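/*
 * Insert the free extent [start, start + size) into range_tree, merging it
 * with an adjacent extent on either side when they touch, and keep
 * range_size_tree in sync with the resulting extent.
 */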
void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(range_t{start, end},
                                         range_tree.key_comp());

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);

  if (merge_before && merge_after) {
    _range_size_tree_rm(*rs_before);
    _range_size_tree_rm(*rs_after);
    rs_after->start = rs_before->start;
    range_tree.erase_and_dispose(rs_before, dispose_rs{});
    _range_size_tree_try_insert(*rs_after);
  } else if (merge_before) {
    _range_size_tree_rm(*rs_before);
    rs_before->end = end;
    _range_size_tree_try_insert(*rs_before);
  } else if (merge_after) {
    _range_size_tree_rm(*rs_after);
    rs_after->start = start;
    _range_size_tree_try_insert(*rs_after);
  } else {
    _try_insert_range(start, end, &rs_after);
  }
}

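/*
 * Carve [start, end) out of the free extent pointed to by rs: trim it on the
 * left and/or right, split it in two when the removal is strictly inside it,
 * or erase it entirely when the whole extent is consumed. range_size_tree is
 * updated to match.
 */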
void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
                                          AvlAllocator::range_tree_t::iterator& rs)
{
  bool left_over = (rs->start != start);
  bool right_over = (rs->end != end);

  _range_size_tree_rm(*rs);

  if (left_over && right_over) {
    auto old_right_end = rs->end;
    auto insert_pos = rs;
    ceph_assert(insert_pos != range_tree.end());
    ++insert_pos;
    rs->end = start;

    // Insert the tail first to be sure insert_pos hasn't been disposed.
    // This wouldn't dispose rs though, since it's out of range_size_tree.
    // Don't care about the small chance of a 'not-the-best-choice-for-removal'
    // case, which might happen if rs has the lowest size.
    _try_insert_range(end, old_right_end, &insert_pos);
    _range_size_tree_try_insert(*rs);

  } else if (left_over) {
    rs->end = start;
    _range_size_tree_try_insert(*rs);
  } else if (right_over) {
    rs->start = end;
    _range_size_tree_try_insert(*rs);
  } else {
    range_tree.erase_and_dispose(rs, dispose_rs{});
  }
}

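/*
 * Remove [start, start + size) from the free space; the range must be fully
 * contained within a single free extent.
 */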
void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->start <= start);
  ceph_assert(rs->end >= end);

  _process_range_removal(start, end, rs);
}

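/*
 * Best-effort removal of [start, start + size): walk the free extents that
 * overlap the range, remove the overlapping pieces, and report every
 * sub-range through cb with a flag indicating whether it was actually free
 * (and therefore removed) or not.
 */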
void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
                                         std::function<void(uint64_t, uint64_t, bool)> cb)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(range_t{start, end},
                            range_tree.key_comp());

  if (rs == range_tree.end() || rs->start >= end) {
    cb(start, size, false);
    return;
  }

  do {
    auto next_rs = rs;
    ++next_rs;

    if (start < rs->start) {
      cb(start, rs->start - start, false);
      start = rs->start;
    }
    auto range_end = std::min(rs->end, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->start < end && start < end);
  if (start < end) {
    cb(start, end - start, false);
  }
}

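/*
 * Satisfy a request for `want` bytes by repeatedly allocating single extents
 * of at most max_alloc_size bytes until the request is met or the allocator
 * runs out of space. Returns the number of bytes actually allocated (which
 * may be less than `want`), or -ENOSPC if nothing could be allocated at all.
 */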
int64_t AvlAllocator::_allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
                      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  return allocated ? allocated : -ENOSPC;
}

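/*
 * Allocate a single extent of (at most) `size` bytes aligned to `unit`.
 * Normally this scans range_tree first-fit starting from a per-alignment
 * cursor; when free space is low or fragmented it falls back to the
 * size-sorted range_size_tree (best-fit). Returns 0 and fills
 * *offset/*length on success, -ENOSPC otherwise.
 */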
int AvlAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->end - p->start;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }
  /*
   * Find the largest power of 2 block size that evenly divides the
   * requested size. This is used to try to allocate blocks with similar
   * alignment from the same area (i.e. same cursor bucket), but it does
   * not guarantee that allocations of other sizes will not be placed in
   * the same region.
   */
  const uint64_t align = size & -size;
  ceph_assert(align != 0);
  uint64_t *cursor = &lbas[cbits(align) - 1];

  const int free_pct = num_free * 100 / num_total;
  uint64_t start = 0;
  /*
   * If we're running low on space, switch to using the size-sorted
   * AVL tree (best-fit).
   */
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    *cursor = 0;
    do {
      start = _block_picker(range_size_tree, cursor, size, unit);
      if (start != -1ULL || !force_range_size_alloc) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  } else {
    start = _block_picker(range_tree, cursor, size, unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}

void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
{
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

void AvlAllocator::_release(const PExtentVector& release_set) {
  for (auto& e : release_set) {
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << e.offset
                   << " length 0x" << e.length
                   << std::dec << dendl;
    _add_to_tree(e.offset, e.length);
  }
}

void AvlAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear_and_dispose(dispose_rs{});
}

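/*
 * Two constructors: the first derives range_count_cap from max_mem so the
 * number of range_seg_t entries kept in memory can be bounded; the second
 * does not set a cap.
 */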
AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           uint64_t max_mem,
                           const std::string& name) :
  Allocator(name, device_size, block_size),
  num_total(device_size),
  block_size(block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           const std::string& name) :
  Allocator(name, device_size, block_size),
  num_total(device_size),
  block_size(block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  cct(cct)
{}

AvlAllocator::~AvlAllocator()
{
  shutdown();
}

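/*
 * Public entry point: asserts the request is unit-aligned, clamps
 * max_alloc_size to what a single bluestore_pextent_t can describe, and
 * forwards to _allocate() under the allocator lock.
 *
 * Illustrative call (names are hypothetical, not taken from this file):
 *
 *   PExtentVector extents;
 *   int64_t got = alloc->allocate(0x100000, 0x1000, 0, 0, &extents);
 *   // 'got' is the number of bytes actually allocated (possibly less than
 *   // requested), or -ENOSPC if nothing could be allocated.
 */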
int64_t AvlAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  ceph_assert(isp2(unit));
  ceph_assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
  }
  std::lock_guard l(lock);
  return _allocate(want, unit, max_alloc_size, hint, extents);
}

void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
  std::lock_guard l(lock);
  _release(release_set);
}

uint64_t AvlAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double AvlAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  return _get_fragmentation();
}

void AvlAllocator::dump()
{
  std::lock_guard l(lock);
  _dump();
}

void AvlAllocator::_dump() const
{
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }

  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }
}

void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
  for (auto& rs : range_tree) {
    notify(rs.start, rs.end - rs.start);
  }
}

void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _add_to_tree(offset, length);
}

void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _remove_from_tree(offset, length);
}

void AvlAllocator::shutdown()
{
  std::lock_guard l(lock);
  _shutdown();
}