// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <mutex>
#include <boost/intrusive/avl_set.hpp>

#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"

struct range_seg_t {
  MEMPOOL_CLASS_HELPERS();  ///< memory monitoring
  uint64_t start;   ///< starting offset of this segment
  uint64_t end;     ///< ending offset (non-inclusive)

  range_seg_t(uint64_t start, uint64_t end)
    : start{start},
      end{end}
  {}
  // Tree is sorted by offset, greater offsets at the end of the tree.
  struct before_t {
    template<typename KeyLeft, typename KeyRight>
    bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
      return lhs.end <= rhs.start;
    }
  };
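  /*
   * Illustrative example (not part of the original header): under before_t,
   * two segments compare "equivalent" exactly when they overlap, which is
   * what lets the offset tree find the segment containing a given range.
   *
   *   range_seg_t a{0x0, 0x1000}, b{0x1000, 0x2000};
   *   range_seg_t::before_t cmp;
   *   cmp(a, b);  // true: a ends where b starts, so a sorts first
   *   cmp(b, a);  // false
   *   // a and range_seg_t{0x800, 0x1800} are "equivalent": they overlap
   */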
  boost::intrusive::avl_set_member_hook<> offset_hook;

  // Tree is sorted by size, larger sizes at the end of the tree.
  struct shorter_t {
    template<typename KeyType>
    bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
      auto lhs_size = lhs.end - lhs.start;
      auto rhs_size = rhs.end - rhs.start;
      if (lhs_size < rhs_size) {
        return true;
      } else if (lhs_size > rhs_size) {
        return false;
      } else {
        return lhs.start < rhs.start;
      }
    }
  };
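  /*
   * Illustrative example (not part of the original header): shorter_t
   * orders by length first and breaks ties by offset, so equally sized
   * segments stay in offset order within the size tree.
   *
   *   range_seg_t s1{0x0, 0x1000};     // length 0x1000
   *   range_seg_t s2{0x5000, 0x7000};  // length 0x2000
   *   range_seg_t s3{0x9000, 0xa000};  // length 0x1000
   *   range_seg_t::shorter_t cmp;
   *   cmp(s1, s2);  // true: 0x1000 < 0x2000
   *   cmp(s1, s3);  // true: equal lengths, tie broken by start offset
   */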
  inline uint64_t length() const {
    return end - start;
  }
  boost::intrusive::avl_set_member_hook<> size_hook;
};

class AvlAllocator : public Allocator {
  struct dispose_rs {
    void operator()(range_seg_t* p)
    {
      delete p;
    }
  };

protected:
  /*
   * ctor intended for use by descendant class(es) which provide
   * handling for spilled-over entries
   * (when entry count >= max_entries)
   */
  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               uint64_t max_mem,
               std::string_view name);

public:
  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               std::string_view name);
  ~AvlAllocator();
  const char* get_type() const override
  {
    return "avl";
  }
  int64_t allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents) override;
  void release(const interval_set<uint64_t>& release_set) override;
  uint64_t get_free() override;
  double get_fragmentation() override;

  void dump() override;
  void foreach(
    std::function<void(uint64_t offset, uint64_t length)> notify) override;
  void init_add_free(uint64_t offset, uint64_t length) override;
  void init_rm_free(uint64_t offset, uint64_t length) override;
  void shutdown() override;
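  /*
   * Illustrative lifecycle sketch (not part of the original header; the
   * argument values are made up): the freelist is primed with
   * init_add_free()/init_rm_free(), after which allocate()/release()
   * service normal operation.
   *
   *   AvlAllocator alloc(cct, device_size, block_size, "block");
   *   alloc.init_add_free(0, device_size);
   *   PExtentVector extents;
   *   alloc.allocate(0x10000, block_size, 0, 0, &extents);
   *   interval_set<uint64_t> to_release;
   *   to_release.insert(extents[0].offset, extents[0].length);
   *   alloc.release(to_release);
   */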

private:
  // pick a range by searching from the cursor forward
  uint64_t _pick_block_after(
    uint64_t *cursor,
    uint64_t size,
    uint64_t align);
  // pick a range with exactly the same size or larger
  uint64_t _pick_block_fits(
    uint64_t size,
    uint64_t align);
  int _allocate(
    uint64_t size,
    uint64_t unit,
    uint64_t *offset,
    uint64_t *length);

  using range_tree_t =
    boost::intrusive::avl_set<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::before_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::offset_hook>>;
  range_tree_t range_tree;    ///< main range tree
  /*
   * The range_size_tree should always contain the
   * same number of segments as the range_tree.
   * The only difference is that the range_size_tree
   * is ordered by segment sizes.
   */
  using range_size_tree_t =
    boost::intrusive::avl_multiset<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::shorter_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::size_hook>,
      boost::intrusive::constant_time_size<true>>;
  range_size_tree_t range_size_tree;

  uint64_t num_free = 0;     ///< total bytes in freelist

  /*
   * This value defines the number of elements in the lbas array.
   * The value of 64 was chosen as it covers all power-of-2 buckets
   * up to UINT64_MAX: one bucket per possible position of the
   * highest set bit of a 64-bit size.
   */
  static constexpr unsigned MAX_LBAS = 64;
  uint64_t lbas[MAX_LBAS] = {0};
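  /*
   * Illustrative sketch (not part of the original header): each lbas entry
   * is a first-fit cursor for one power-of-two size bucket, so allocations
   * of similar sizes resume scanning where the previous one left off.
   * Assuming cbits() returns the bit length of its argument, the cursor
   * selection in _allocate() looks roughly like:
   *
   *   uint64_t* cursor = &lbas[cbits(size) - 1];  // e.g. size 0x1000 -> lbas[12]
   *   uint64_t start = _pick_block_after(cursor, size, unit);
   */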

  /*
   * Minimum size which forces the dynamic allocator to change
   * its allocation strategy. Once the allocator cannot satisfy
   * an allocation of this size it switches to using a more
   * aggressive strategy (i.e. search by size rather than offset).
   */
  uint64_t range_size_alloc_threshold = 0;
  /*
   * The minimum free space, in percent, which must be available
   * in the allocator to continue allocations in a first-fit fashion.
   * Once the allocator's free space drops below this level we dynamically
   * switch to using best-fit allocations.
   */
  int range_size_alloc_free_pct = 0;
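  /*
   * Sketch of how the two knobs above combine (the actual check lives in
   * AvlAllocator.cc; the condition is shown for illustration only):
   * first-fit is abandoned for best-fit roughly when
   *
   *   max_alloc_size < range_size_alloc_threshold ||
   *   free_pct < range_size_alloc_free_pct
   *
   * where free_pct is num_free as a percentage of the device size.
   */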
  /*
   * Maximum number of segments to check in first-fit mode; without this
   * limit a fragmented device can see lots of iterations, and _block_picker()
   * becomes the performance limiting factor on high-performance storage.
   */
  const uint32_t max_search_count;
  /*
   * Maximum distance to search forward from the last offset; without this
   * limit a fragmented device can see lots of iterations, and _block_picker()
   * becomes the performance limiting factor on high-performance storage.
   */
  const uint32_t max_search_bytes;
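  /*
   * Both limits bound the same forward scan (a sketch of the intent; the
   * precise loop is in AvlAllocator.cc): starting at the cursor, first-fit
   * inspects at most max_search_count candidate segments and walks at most
   * max_search_bytes past the starting offset before giving up and falling
   * back to the size-ordered tree; a limit of 0 disables that bound.
   */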
  /*
   * Max amount of range entries allowed; 0 means unlimited.
   */
  uint64_t range_count_cap = 0;

  void _range_size_tree_rm(range_seg_t& r) {
    ceph_assert(num_free >= r.length());
    num_free -= r.length();
    range_size_tree.erase(r);
  }
  void _range_size_tree_try_insert(range_seg_t& r) {
    if (_try_insert_range(r.start, r.end)) {
      range_size_tree.insert(r);
      num_free += r.length();
    } else {
      range_tree.erase_and_dispose(r, dispose_rs{});
    }
  }
  bool _try_insert_range(uint64_t start,
                         uint64_t end,
                         range_tree_t::iterator* insert_pos = nullptr) {
    bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
    bool remove_lowest = false;
    if (!res) {
      if (end - start > _lowest_size_available()) {
        remove_lowest = true;
        res = true;
      }
    }
    if (!res) {
      _spillover_range(start, end);
    } else {
      // NB: the insertion has to happen before the removal below,
      // to avoid disposing an iterator the insertion might depend on.
      if (insert_pos) {
        auto new_rs = new range_seg_t{ start, end };
        range_tree.insert_before(*insert_pos, *new_rs);
        range_size_tree.insert(*new_rs);
        num_free += new_rs->length();
      }
      if (remove_lowest) {
        auto r = range_size_tree.begin();
        _range_size_tree_rm(*r);
        _spillover_range(r->start, r->end);
        range_tree.erase_and_dispose(*r, dispose_rs{});
      }
    }
    return res;
  }
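  /*
   * Illustrative walk-through (not part of the original header): with
   * range_count_cap == 2 and two segments already tracked, a third segment
   * either evicts the currently smallest segment into _spillover_range()
   * when the newcomer is larger than it, or is spilled over itself when
   * it is not; the in-memory trees therefore keep the largest extents.
   */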
  virtual void _spillover_range(uint64_t start, uint64_t end) {
    // this should be overridden when a range count cap is present,
    // i.e. (range_count_cap > 0)
    ceph_assert(false);
  }
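  /*
   * Hypothetical descendant sketch (illustrative only; the class and member
   * names below are made up): a subclass keeps the AVL trees bounded by
   * handing spilled-over extents to a secondary allocator.
   *
   *   class CappedAllocator : public AvlAllocator {
   *     Allocator* fallback;  // hypothetical secondary allocator
   *     void _spillover_range(uint64_t start, uint64_t end) override {
   *       // keep the extent allocatable instead of dropping it
   *       fallback->init_add_free(start, end - start);
   *     }
   *   };
   */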
protected:
  // called when an extent is to be released/marked free
  virtual void _add_to_tree(uint64_t start, uint64_t size);

protected:
  CephContext* cct;
  std::mutex lock;

  double _get_fragmentation() const {
    auto free_blocks = p2align(num_free, (uint64_t)block_size) / block_size;
    if (free_blocks <= 1) {
      return .0;
    }
    return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
  }
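  /*
   * Worked example (illustrative figures): 8 free blocks split across 4
   * segments give (4 - 1) / (8 - 1) ~= 0.43; the same 8 blocks in one
   * contiguous segment give 0.0, and 8 single-block segments give 1.0.
   */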
  void _dump() const;
  void _foreach(std::function<void(uint64_t offset, uint64_t length)>) const;

  uint64_t _lowest_size_available() {
    auto rs = range_size_tree.begin();
    return rs != range_size_tree.end() ? rs->length() : 0;
  }

  int64_t _allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents);

  void _release(const interval_set<uint64_t>& release_set);
  void _release(const PExtentVector& release_set);
  void _shutdown();

  void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
  void _remove_from_tree(uint64_t start, uint64_t size);
  void _try_remove_from_tree(uint64_t start, uint64_t size,
    std::function<void(uint64_t offset, uint64_t length, bool found)> cb);

  uint64_t _get_free() const {
    return num_free;
  }
};