]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "BtreeAllocator.h" | |
5 | ||
1e59de90 | 6 | #include <bit> |
20effc67 TL |
7 | #include <limits> |
8 | ||
9 | #include "common/config_proxy.h" | |
10 | #include "common/debug.h" | |
11 | ||
12 | #define dout_context cct | |
13 | #define dout_subsys ceph_subsys_bluestore | |
14 | #undef dout_prefix | |
15 | #define dout_prefix *_dout << "BtreeAllocator " | |
16 | ||
17 | /* | |
18 | * This is a helper function that can be used by the allocator to find | |
19 | * a suitable block to allocate. This will search the specified B-tree | |
20 | * looking for a block that matches the specified criteria. | |
21 | */ | |
22 | uint64_t BtreeAllocator::_pick_block_after(uint64_t *cursor, | |
23 | uint64_t size, | |
24 | uint64_t align) | |
25 | { | |
26 | auto rs_start = range_tree.lower_bound(*cursor); | |
27 | for (auto rs = rs_start; rs != range_tree.end(); ++rs) { | |
aee94f69 | 28 | uint64_t offset = rs->first; |
20effc67 TL |
29 | if (offset + size <= rs->second) { |
30 | *cursor = offset + size; | |
31 | return offset; | |
32 | } | |
33 | } | |
34 | if (*cursor == 0) { | |
35 | // If we already started from beginning, don't bother with searching from beginning | |
36 | return -1ULL; | |
37 | } | |
38 | // If we reached end, start from beginning till cursor. | |
39 | for (auto rs = range_tree.begin(); rs != rs_start; ++rs) { | |
aee94f69 | 40 | uint64_t offset = rs->first; |
20effc67 TL |
41 | if (offset + size <= rs->second) { |
42 | *cursor = offset + size; | |
43 | return offset; | |
44 | } | |
45 | } | |
46 | return -1ULL; | |
47 | } | |
48 | ||
49 | uint64_t BtreeAllocator::_pick_block_fits(uint64_t size, | |
50 | uint64_t align) | |
51 | { | |
52 | // instead of searching from cursor, just pick the smallest range which fits | |
53 | // the needs | |
54 | auto rs_start = range_size_tree.lower_bound(range_value_t{0,size}); | |
55 | for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) { | |
aee94f69 | 56 | uint64_t offset = rs->start; |
20effc67 TL |
57 | if (offset + size <= rs->start + rs->size) { |
58 | return offset; | |
59 | } | |
60 | } | |
61 | return -1ULL; | |
62 | } | |
63 | ||
/*
 * Return the free segment [start, start + size) to the allocator.
 *
 * The segment is inserted into both trees: range_tree (offset-keyed,
 * mapping start -> end) and range_size_tree (ordered by size).  If the
 * new segment touches an existing free segment on either side it is
 * coalesced, so the trees never hold two adjacent free ranges.  Any
 * segment whose bounds change is erased from the size tree and
 * re-inserted with its new size, keeping the two trees in lockstep.
 */
void BtreeAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  ceph_assert(size != 0);

  uint64_t end = start + size;

  // first segment starting strictly after `start`
  auto rs_after = range_tree.upper_bound(start);

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  // merge only when the new range is exactly adjacent to a neighbor
  bool merge_before = (rs_before != range_tree.end() && rs_before->second == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->first == end);

  if (merge_before && merge_after) {
    // | before |//////| after |
    // | before >>>>>>>>>>>>>>> |
    range_seg_t seg_before{rs_before->first, rs_before->second};
    range_seg_t seg_after{rs_after->first, rs_after->second};
    // expand the head seg before rs_{before,after} are invalidated
    rs_before->second = seg_after.end;
    // remove the tail seg from offset tree
    range_tree.erase(rs_after);
    // remove the head and tail seg from size tree
    range_size_tree.erase(seg_before);
    range_size_tree.erase(seg_after);
    // insert the merged seg into size tree
    range_size_tree.emplace(seg_before.start, seg_after.end);
  } else if (merge_before) {
    // | before |//////|
    // | before >>>>>>>> |
    // remove the head seg from the size tree
    range_seg_t seg_before{rs_before->first, rs_before->second};
    range_size_tree.erase(seg_before);
    // expand the head seg in the offset tree
    rs_before->second = end;
    // insert the merged seg into size tree
    range_size_tree.emplace(seg_before.start, end);
  } else if (merge_after) {
    // |//////| after |
    // | merge after |
    // remove the tail seg from size tree
    range_seg_t seg_after{rs_after->first, rs_after->second};
    range_size_tree.erase(seg_after);
    // remove the tail seg from offset tree
    range_tree.erase(rs_after);
    // insert the merged seg
    range_tree.emplace(start, seg_after.end);
    range_size_tree.emplace(start, seg_after.end);
  } else {
    // no neighbours
    range_tree.emplace_hint(rs_after, start, end);
    range_size_tree.emplace(start, end);
  }
  num_free += size;
}
123 | ||
/*
 * Carve the allocated range [start, end) out of the free segment that
 * `rs` points at.
 *
 * The caller guarantees that *rs fully contains [start, end).  Depending
 * on whether free space is left over on the left and/or right of the
 * removed range, the segment is shrunk, split in two, or erased
 * entirely.  The size tree is updated to mirror every change made to
 * the offset tree.  Note that `rs` may be erased (invalidated) by this
 * call.
 */
void BtreeAllocator::_process_range_removal(uint64_t start, uint64_t end,
  BtreeAllocator::range_tree_t::iterator& rs)
{
  bool left_over = (rs->first != start);
  bool right_over = (rs->second != end);

  // the whole segment leaves the size tree; any surviving piece is
  // re-inserted below with its new size
  range_seg_t seg_whole{rs->first, rs->second};
  range_size_tree.erase(seg_whole);

  // | left <|////| right |
  if (left_over && right_over) {
    // add the spin-off right seg
    range_seg_t seg_after{end, seg_whole.end};
    range_tree.emplace_hint(rs, seg_after.start, seg_after.end);
    range_size_tree.emplace(seg_after);
    // shrink the left seg in offset tree
    rs->second = start;
    // insert the shrunk left seg back into size tree
    range_size_tree.emplace(seg_whole.start, start);
  } else if (left_over) {
    // | left <|///////////|
    // shrink the left seg in the offset tree
    rs->second = start;
    // insert the shrunk left seg back into size tree
    range_size_tree.emplace(seg_whole.start, start);
  } else if (right_over) {
    // |//////////| right |
    // remove the whole seg from offset tree
    range_tree.erase(rs);
    // add the spin-off right seg
    range_seg_t seg_after{end, seg_whole.end};
    range_tree.emplace(seg_after.start, seg_after.end);
    range_size_tree.emplace(seg_after);
  } else {
    // exact match: the segment disappears entirely
    range_tree.erase(rs);
  }
  num_free -= (end - start);
}
162 | ||
/*
 * Remove the allocated range [start, start + size) from the free trees.
 *
 * NOTE(review): range_tree.find(start) is an exact-key lookup, so the
 * removal must begin precisely at a stored segment's start offset; the
 * `rs->first <= start` assert below can then only hold with equality,
 * and removing from the middle of a segment would trip the end()
 * assert.  Confirm this is the intended contract (compare with the
 * tolerant _try_remove_from_tree).
 */
void BtreeAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);
  ceph_assert(size <= num_free);

  auto rs = range_tree.find(start);
  /* Make sure we completely overlap with someone */
  ceph_assert(rs != range_tree.end());
  ceph_assert(rs->first <= start);
  ceph_assert(rs->second >= end);

  _process_range_removal(start, end, rs);
}
178 | ||
/*
 * Best-effort removal of [start, start + size) from the free trees.
 *
 * Walks the free segments overlapping the requested range.  For every
 * sub-range the callback is invoked as cb(off, len, removed):
 * removed == true for pieces that were free and have now been claimed,
 * false for pieces that were not free (including any tail past the last
 * overlapping segment).
 *
 * NOTE(review): the initial range_tree.find(start) is an exact-key
 * lookup; if `start` falls inside (rather than at the head of) a free
 * segment, the whole request is reported as not-removed.  Verify this
 * matches callers' expectations.
 */
void BtreeAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
  std::function<void(uint64_t, uint64_t, bool)> cb)
{
  uint64_t end = start + size;

  ceph_assert(size != 0);

  auto rs = range_tree.find(start);

  if (rs == range_tree.end() || rs->first >= end) {
    // nothing of [start, end) is tracked as free
    cb(start, size, false);
    return;
  }

  do {
    // save the successor first: _process_range_removal may erase `rs`
    auto next_rs = rs;
    ++next_rs;

    if (start < rs->first) {
      // the gap before this free segment was not free
      cb(start, rs->first - start, false);
      start = rs->first;
    }
    auto range_end = std::min(rs->second, end);
    _process_range_removal(start, range_end, rs);
    cb(start, range_end - start, true);
    start = range_end;

    rs = next_rs;
  } while (rs != range_tree.end() && rs->first < end && start < end);
  if (start < end) {
    // tail past the last overlapping segment was not free
    cb(start, end - start, false);
  }
}
213 | ||
214 | int64_t BtreeAllocator::_allocate( | |
215 | uint64_t want, | |
216 | uint64_t unit, | |
217 | uint64_t max_alloc_size, | |
218 | int64_t hint, // unused, for now! | |
219 | PExtentVector* extents) | |
220 | { | |
221 | uint64_t allocated = 0; | |
222 | while (allocated < want) { | |
223 | uint64_t offset, length; | |
224 | int r = _allocate(std::min(max_alloc_size, want - allocated), | |
225 | unit, &offset, &length); | |
226 | if (r < 0) { | |
227 | // Allocation failed. | |
228 | break; | |
229 | } | |
230 | extents->emplace_back(offset, length); | |
231 | allocated += length; | |
232 | } | |
233 | assert(range_size_tree.size() == range_tree.size()); | |
234 | return allocated ? allocated : -ENOSPC; | |
235 | } | |
236 | ||
/*
 * Allocate a single extent of at most `size` bytes, aligned to `unit`.
 *
 * If no free segment can hold `size`, the request is shrunk to the
 * largest unit-aligned size that might fit (force_range_size_alloc).
 * Strategy selection: best-fit via the size tree when space is tight
 * (forced shrink, largest segment below bluestore_avl_alloc_bf_threshold,
 * or free space below bluestore_avl_alloc_bf_free_pct percent); otherwise
 * first-fit with a per-size-class rolling cursor.  On success stores the
 * extent into *offset/*length and returns 0; returns -ENOSPC when
 * nothing fits.  Caller must hold `lock`.
 */
int BtreeAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  // the size tree is ordered by size, so the largest segment is last
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->size;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    // shrink the request to the largest unit-aligned size that can fit
    size = p2align(max_size, unit);
    ceph_assert(size > 0);
    force_range_size_alloc = true;
  }

  const int free_pct = num_free * 100 / device_size;
  uint64_t start = 0;
  /*
   * If we're running low on space switch to using the size
   * sorted B-tree (best-fit).
   */
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    do {
      start = _pick_block_fits(size, unit);
      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  } else {
    do {
      /*
       * Find the largest power of 2 block size that evenly divides the
       * requested size. This is used to try to allocate blocks with similar
       * alignment from the same area (i.e. same cursor bucket) but it does
       * not guarantee that other allocations sizes may exist in the same
       * region.
       */
      uint64_t* cursor = &lbas[cbits(size) - 1];
      start = _pick_block_after(cursor, size, unit);
      dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
      if (start != uint64_t(-1ULL)) {
        break;
      }
      // try to collect smaller extents as we could fail to retrieve
      // that large block due to misaligned extents
      size = p2align(size >> 1, unit);
    } while (size >= unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}
307 | ||
308 | void BtreeAllocator::_release(const interval_set<uint64_t>& release_set) | |
309 | { | |
310 | for (auto p = release_set.begin(); p != release_set.end(); ++p) { | |
311 | const auto offset = p.get_start(); | |
312 | const auto length = p.get_len(); | |
313 | ceph_assert(offset + length <= uint64_t(device_size)); | |
314 | ldout(cct, 10) << __func__ << std::hex | |
315 | << " offset 0x" << offset | |
316 | << " length 0x" << length | |
317 | << std::dec << dendl; | |
318 | _add_to_tree(offset, length); | |
319 | } | |
320 | } | |
321 | ||
322 | void BtreeAllocator::_release(const PExtentVector& release_set) { | |
323 | for (auto& e : release_set) { | |
324 | ldout(cct, 10) << __func__ << std::hex | |
325 | << " offset 0x" << e.offset | |
326 | << " length 0x" << e.length | |
327 | << std::dec << dendl; | |
328 | _add_to_tree(e.offset, e.length); | |
329 | } | |
330 | } | |
331 | ||
// Drop all tracked free ranges from both trees.  Caller must hold `lock`.
void BtreeAllocator::_shutdown()
{
  range_size_tree.clear();
  range_tree.clear();
}
337 | ||
/*
 * Primary constructor.
 *
 * Best-fit thresholds come from the shared bluestore_avl_alloc_* config
 * options.  range_count_cap is derived as max_mem / sizeof(range_seg_t);
 * a max_mem of 0 yields a cap of 0 (presumably treated as "no cap" by
 * the code using this field -- not visible in this file, confirm).
 */
BtreeAllocator::BtreeAllocator(CephContext* cct,
                               int64_t device_size,
                               int64_t block_size,
                               uint64_t max_mem,
                               std::string_view name) :
  Allocator(name, device_size, block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  range_count_cap(max_mem / sizeof(range_seg_t)),
  cct(cct)
{}
351 | ||
// Convenience constructor: delegates with max_mem = 0 (no memory cap).
BtreeAllocator::BtreeAllocator(CephContext* cct,
                               int64_t device_size,
                               int64_t block_size,
                               std::string_view name) :
  BtreeAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
{}
358 | ||
// Destructor: clears both trees via the locking shutdown() wrapper.
BtreeAllocator::~BtreeAllocator()
{
  shutdown();
}
363 | ||
364 | int64_t BtreeAllocator::allocate( | |
365 | uint64_t want, | |
366 | uint64_t unit, | |
367 | uint64_t max_alloc_size, | |
368 | int64_t hint, // unused, for now! | |
369 | PExtentVector* extents) | |
370 | { | |
371 | ldout(cct, 10) << __func__ << std::hex | |
372 | << " want 0x" << want | |
373 | << " unit 0x" << unit | |
374 | << " max_alloc_size 0x" << max_alloc_size | |
375 | << " hint 0x" << hint | |
376 | << std::dec << dendl; | |
1e59de90 | 377 | ceph_assert(std::has_single_bit(unit)); |
20effc67 TL |
378 | ceph_assert(want % unit == 0); |
379 | ||
380 | if (max_alloc_size == 0) { | |
381 | max_alloc_size = want; | |
382 | } | |
383 | if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max(); | |
384 | max_alloc_size >= cap) { | |
385 | max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size); | |
386 | } | |
387 | std::lock_guard l(lock); | |
388 | return _allocate(want, unit, max_alloc_size, hint, extents); | |
389 | } | |
390 | ||
391 | void BtreeAllocator::release(const interval_set<uint64_t>& release_set) { | |
392 | std::lock_guard l(lock); | |
393 | _release(release_set); | |
394 | } | |
395 | ||
396 | uint64_t BtreeAllocator::get_free() | |
397 | { | |
398 | std::lock_guard l(lock); | |
399 | return num_free; | |
400 | } | |
401 | ||
402 | double BtreeAllocator::get_fragmentation() | |
403 | { | |
404 | std::lock_guard l(lock); | |
405 | return _get_fragmentation(); | |
406 | } | |
407 | ||
408 | void BtreeAllocator::dump() | |
409 | { | |
410 | std::lock_guard l(lock); | |
411 | _dump(); | |
412 | } | |
413 | ||
414 | void BtreeAllocator::_dump() const | |
415 | { | |
416 | ldout(cct, 0) << __func__ << " range_tree: " << dendl; | |
417 | for (auto& rs : range_tree) { | |
418 | ldout(cct, 0) << std::hex | |
419 | << "0x" << rs.first << "~" << rs.second | |
420 | << std::dec | |
421 | << dendl; | |
422 | } | |
423 | ||
424 | ldout(cct, 0) << __func__ << " range_size_tree: " << dendl; | |
425 | for (auto& rs : range_size_tree) { | |
426 | ldout(cct, 0) << std::hex | |
427 | << "0x" << rs.size << "@" << rs.start | |
428 | << std::dec | |
429 | << dendl; | |
430 | } | |
431 | } | |
432 | ||
1e59de90 | 433 | void BtreeAllocator::foreach(std::function<void(uint64_t offset, uint64_t length)> notify) |
20effc67 | 434 | { |
1e59de90 | 435 | std::lock_guard l(lock); |
20effc67 TL |
436 | for (auto& rs : range_tree) { |
437 | notify(rs.first, rs.second - rs.first); | |
438 | } | |
439 | } | |
440 | ||
441 | void BtreeAllocator::init_add_free(uint64_t offset, uint64_t length) | |
442 | { | |
443 | if (!length) | |
444 | return; | |
445 | std::lock_guard l(lock); | |
446 | ceph_assert(offset + length <= uint64_t(device_size)); | |
447 | ldout(cct, 10) << __func__ << std::hex | |
448 | << " offset 0x" << offset | |
449 | << " length 0x" << length | |
450 | << std::dec << dendl; | |
451 | _add_to_tree(offset, length); | |
452 | } | |
453 | ||
454 | void BtreeAllocator::init_rm_free(uint64_t offset, uint64_t length) | |
455 | { | |
456 | if (!length) | |
457 | return; | |
458 | std::lock_guard l(lock); | |
459 | ceph_assert(offset + length <= uint64_t(device_size)); | |
460 | ldout(cct, 10) << __func__ << std::hex | |
461 | << " offset 0x" << offset | |
462 | << " length 0x" << length | |
463 | << std::dec << dendl; | |
464 | _remove_from_tree(offset, length); | |
465 | } | |
466 | ||
467 | void BtreeAllocator::shutdown() | |
468 | { | |
469 | std::lock_guard l(lock); | |
470 | _shutdown(); | |
471 | } |